From 2b7294d5bb8a575a4b73d037f021ae778dd8c337 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Tue, 23 Jan 2024 18:02:35 +0800 Subject: [PATCH 1/8] feat: export more api for custom --- .clang-format | 1 - examples/cli/main.cpp | 165 +++--- model.cpp | 544 ++++++++++--------- model.h | 2 +- stable-diffusion.cpp | 1180 ++++++++++++++++++++++++++--------------- stable-diffusion.h | 82 ++- 6 files changed, 1209 insertions(+), 765 deletions(-) diff --git a/.clang-format b/.clang-format index 4fe720b8..37881bfc 100644 --- a/.clang-format +++ b/.clang-format @@ -3,7 +3,6 @@ UseTab: Never IndentWidth: 4 TabWidth: 4 AllowShortIfStatementsOnASingleLine: false -IndentCaseLabels: false ColumnLimit: 0 AccessModifierOffset: -4 NamespaceIndentation: All diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index af2c337d..bde19f34 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -9,40 +9,42 @@ #include "stable-diffusion.h" #define STB_IMAGE_IMPLEMENTATION + #include "stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION #define STB_IMAGE_WRITE_STATIC + #include "stb_image_write.h" -const char* rng_type_to_str[] = { - "std_default", - "cuda", +const char *rng_type_to_str[] = { + "std_default", + "cuda", }; // Names of the sampler method, same order as enum sample_method in stable-diffusion.h -const char* sample_method_str[] = { - "euler_a", - "euler", - "heun", - "dpm2", - "dpm++2s_a", - "dpm++2m", - "dpm++2mv2", - "lcm", +const char *sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h -const char* schedule_str[] = { - "default", - "discrete", - "karras", +const char *schedule_str[] = { + "default", + "discrete", + "karras", }; -const char* modes_str[] = { - "txt2img", - "img2img", - "convert", +const char *modes_str[] = { + "txt2img", + "img2img", + "convert", }; enum SDMode { @@ -54,7 +56,7 @@ enum SDMode { struct SDParams { int n_threads = -1; - SDMode mode = TXT2IMG; + SDMode mode = TXT2IMG; std::string model_path; std::string vae_path; @@ -68,22 +70,22 @@ struct SDParams { std::string prompt; std::string negative_prompt; float cfg_scale = 7.0f; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; int batch_count = 1; sample_method_t sample_method = EULER_A; - schedule_t schedule = DEFAULT; - int sample_steps = 20; - float strength = 0.75f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool vae_tiling = false; + schedule_t schedule = DEFAULT; + int sample_steps = 20; + float strength = 0.75f; + rng_type_t rng_type = CUDA_RNG; + int64_t seed = 42; + bool verbose = false; + bool vae_tiling = false; }; -static std::string sd_basename(const std::string& path) { +static std::string sd_basename(const std::string &path) { size_t pos = path.find_last_of('/'); if (pos != std::string::npos) { return path.substr(pos + 1); @@ -122,7 +124,7 @@ void print_params(SDParams params) { printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false"); } -void print_usage(int argc, const char* argv[]) { +void print_usage(int argc, const char *argv[]) { printf("usage: %s [arguments]\n", argv[0]); printf("\n"); printf("arguments:\n"); @@ -159,7 +161,7 @@ void print_usage(int argc, const char* argv[]) { printf(" -v, --verbose print extra info\n"); } -void parse_args(int argc, const char** argv, SDParams& params) { +void parse_args(int argc, const char **argv, SDParams ¶ms) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -176,8 +178,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* mode_selected = argv[i]; - int mode_found = -1; + const char *mode_selected = argv[i]; + int mode_found = -1; for (int d = 0; d < MODE_COUNT; d++) { if (!strcmp(mode_selected, modes_str[d])) { mode_found = d; @@ -188,7 +190,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { mode_selected); exit(1); } - params.mode = (SDMode)mode_found; + params.mode = (SDMode) mode_found; } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_arg = true; @@ -234,7 +236,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { } else if (type == "q8_0") { params.wtype = SD_TYPE_Q8_0; } else { - fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n", + fprintf(stderr, + "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n", type.c_str()); exit(1); } @@ -331,8 +334,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* schedule_selected = argv[i]; - int schedule_found = -1; + const char *schedule_selected = argv[i]; + int schedule_found = -1; for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; @@ -342,7 +345,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.schedule = (schedule_t)schedule_found; + params.schedule = (schedule_t) schedule_found; } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_arg = true; @@ -354,8 +357,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* sample_method_selected = argv[i]; - int sample_method_found = -1; + const char *sample_method_selected = argv[i]; + int sample_method_found = -1; for (int m = 0; m < N_SAMPLE_METHODS; m++) { if (!strcmp(sample_method_selected, sample_method_str[m])) { sample_method_found = m; @@ -365,7 +368,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.sample_method = (sample_method_t)sample_method_found; + params.sample_method = (sample_method_t) sample_method_found; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv); exit(0); @@ -431,7 +434,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } if (params.seed < 0) { - srand((int)time(NULL)); + srand((int) time(NULL)); params.seed = rand(); } @@ -462,8 +465,8 @@ std::string get_image_params(SDParams params, int64_t seed) { return parameter_string; } -void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { - SDParams* params = (SDParams*)data; +void sd_log_cb(enum sd_log_level_t level, const char *log, void *data) { + SDParams *params = (SDParams *) data; if (!params->verbose && level <= SD_LOG_DEBUG) { return; } @@ -476,11 +479,11 @@ void sd_log_cb(enum 
sd_log_level_t level, const char* log, void* data) { } } -int main(int argc, const char* argv[]) { +int main(int argc, const char *argv[]) { SDParams params; parse_args(argc, argv, params); - sd_set_log_callback(sd_log_cb, (void*)¶ms); + sd_set_log_callback(sd_log_cb, (void *) ¶ms); if (params.verbose) { print_params(params); @@ -488,7 +491,10 @@ int main(int argc, const char* argv[]) { } if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype); + bool success = convert(params.model_path.c_str(), + params.vae_path.c_str(), + params.output_path.c_str(), + params.wtype); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", @@ -505,12 +511,12 @@ int main(int argc, const char* argv[]) { } } - bool vae_decode_only = true; - uint8_t* input_image_buffer = NULL; + bool vae_decode_only = true; + uint8_t *input_image_buffer = NULL; if (params.mode == IMG2IMG) { vae_decode_only = false; - int c = 0; + int c = 0; input_image_buffer = stbi_load(params.input_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); if (input_image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); @@ -533,24 +539,45 @@ int main(int argc, const char* argv[]) { } } - sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(), - params.vae_path.c_str(), - params.taesd_path.c_str(), - params.lora_model_dir.c_str(), - vae_decode_only, - params.vae_tiling, - true, - params.n_threads, - params.wtype, - params.rng_type, - params.schedule); + sd_ctx_t *sd_ctx = new_sd_ctx( + params.n_threads, + vae_decode_only, + true, + params.lora_model_dir.c_str(), + params.rng_type, + params.vae_tiling, + params.wtype, + params.schedule, + true + ); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); return 1; } - sd_image_t* results; + if (!load_diffusions_from_file(sd_ctx, params.model_path.c_str())) { + printf("load diffusions model failed\n"); + return 1; + } + + if (!params.taesd_path.empty()) { + free_unet_params(sd_ctx); + if (!load_taesd_from_file(sd_ctx, params.taesd_path.c_str())) { + printf("load taesd model failed\n"); + return 1; + } + } + + if (!params.vae_path.empty()) { + free_vae_params(sd_ctx); + if (!load_vae_from_file(sd_ctx, params.vae_path.c_str())) { + printf("load vae model failed\n"); + return 1; + } + } + + sd_image_t *results; if (params.mode == TXT2IMG) { results = txt2img(sd_ctx, params.prompt.c_str(), @@ -564,8 +591,8 @@ int main(int argc, const char* argv[]) { params.seed, params.batch_count); } else { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, + sd_image_t input_image = {(uint32_t) params.width, + (uint32_t) params.height, 3, input_image_buffer}; @@ -592,7 +619,7 @@ int main(int argc, const char* argv[]) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0) { - upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + upscaler_ctx_t *upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.n_threads, params.wtype); @@ -614,7 +641,7 @@ int main(int argc, const char* argv[]) { } } - size_t last = params.output_path.find_last_of("."); + size_t last = params.output_path.find_last_of("."); std::string dummy_name = last != std::string::npos ? 
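// The hunk above replaces the old all-in-one new_sd_ctx(model, vae, taesd, ...)
// call with the split loading API this patch exports. A minimal usage sketch
// follows; the meaning of each new_sd_ctx argument is inferred from the
// StableDiffusionGGML constructor later in this patch (n_threads,
// vae_decode_only, free_params_immediately, lora_model_dir, rng_type,
// vae_tiling, wtype, schedule, init_backend_immediately), and the file names
// are hypothetical:
//
//     #include "stable-diffusion.h"
//
//     sd_ctx_t *ctx = new_sd_ctx(8, true, true, "", CUDA_RNG,
//                                false, SD_TYPE_F16, DEFAULT, true);
//     if (!load_diffusions_from_file(ctx, "sd-v1-5.safetensors")) {
//         return 1;  // every loader returns false on failure, as checked above
//     }
//     free_vae_params(ctx);                               // drop the bundled VAE
//     load_vae_from_file(ctx, "custom_vae.safetensors");  // swap in another one
//     sd_image_t *img = txt2img(ctx, "a photo of a cat", "", -1, 7.0f,
//                               512, 512, EULER_A, 20, 42, 1);
//
// One detail worth double-checking in the hunk above: the TAESD branch calls
// free_unet_params() before load_taesd_from_file(), but TAESD stands in for
// the VAE decoder rather than the UNet, so free_vae_params() may be what was
// intended there.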
params.output_path.substr(0, last) : params.output_path; for (int i = 0; i < params.batch_count; i++) { if (results[i].data == NULL) { diff --git a/model.cpp b/model.cpp index 387a9cf5..60f4dda7 100644 --- a/model.cpp +++ b/model.cpp @@ -23,7 +23,7 @@ #define ST_HEADER_SIZE_LEN 8 -uint64_t read_u64(uint8_t* buffer) { +uint64_t read_u64(uint8_t *buffer) { // little endian uint64_t value = 0; value |= static_cast(buffer[7]) << 56; @@ -37,7 +37,7 @@ uint64_t read_u64(uint8_t* buffer) { return value; } -int32_t read_int(uint8_t* buffer) { +int32_t read_int(uint8_t *buffer) { // little endian int value = 0; value |= buffer[3] << 24; @@ -47,7 +47,7 @@ int32_t read_int(uint8_t* buffer) { return value; } -uint16_t read_short(uint8_t* buffer) { +uint16_t read_short(uint8_t *buffer) { // little endian uint16_t value = 0; value |= buffer[1] << 8; @@ -58,44 +58,44 @@ uint16_t read_short(uint8_t* buffer) { /*================================================= Preprocess ==================================================*/ std::string self_attn_names[] = { - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", }; -const char* unused_tensors[] = { - "betas", - "alphas_cumprod_prev", - "sqrt_alphas_cumprod", - "sqrt_one_minus_alphas_cumprod", - "log_one_minus_alphas_cumprod", - "sqrt_recip_alphas_cumprod", - "sqrt_recipm1_alphas_cumprod", - "posterior_variance", - "posterior_log_variance_clipped", - "posterior_mean_coef1", - "posterior_mean_coef2", - "cond_stage_model.transformer.text_model.embeddings.position_ids", - "cond_stage_model.model.logit_scale", - "cond_stage_model.model.text_projection", - "conditioner.embedders.0.transformer.text_model.embeddings.position_ids", - "conditioner.embedders.0.model.logit_scale", - "conditioner.embedders.1.model.logit_scale", - "model.diffusion_model.time_embedding.cond_proj.weight", - "unet.time_embedding.cond_proj.weight", - "model_ema.decay", - "model_ema.num_updates", - "model_ema.diffusion_model", - "control_model", - "embedding_manager", - "denoiser.sigmas", +const char *unused_tensors[] = { + "betas", + "alphas_cumprod_prev", + "sqrt_alphas_cumprod", + "sqrt_one_minus_alphas_cumprod", + "log_one_minus_alphas_cumprod", + "sqrt_recip_alphas_cumprod", + "sqrt_recipm1_alphas_cumprod", + "posterior_variance", + "posterior_log_variance_clipped", + "posterior_mean_coef1", + "posterior_mean_coef2", + "cond_stage_model.transformer.text_model.embeddings.position_ids", + "cond_stage_model.model.logit_scale", + "cond_stage_model.model.text_projection", + "conditioner.embedders.0.transformer.text_model.embeddings.position_ids", + "conditioner.embedders.0.model.logit_scale", + "conditioner.embedders.1.model.logit_scale", + "model.diffusion_model.time_embedding.cond_proj.weight", + "unet.time_embedding.cond_proj.weight", + "model_ema.decay", + "model_ema.num_updates", + "model_ema.diffusion_model", + "control_model", + "embedding_manager", + "denoiser.sigmas", }; bool is_unused_tensor(std::string name) { - for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char*); i++) { + for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char *); i++) { if (starts_with(name, unused_tensors[i])) { return true; } @@ -104,54 +104,54 @@ bool is_unused_tensor(std::string name) { } std::unordered_map 
open_clip_to_hf_clip_model = { - {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"}, - {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"}, - {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"}, - {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"}, - {"model.text_projection", "transformer.text_model.text_projection"}, + {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"}, + {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"}, + {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"}, + {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"}, + {"model.text_projection", "transformer.text_model.text_projection"}, }; std::unordered_map open_clip_to_hk_clip_resblock = { - {"attn.out_proj.bias", "self_attn.out_proj.bias"}, - {"attn.out_proj.weight", "self_attn.out_proj.weight"}, - {"ln_1.bias", "layer_norm1.bias"}, - {"ln_1.weight", "layer_norm1.weight"}, - {"ln_2.bias", "layer_norm2.bias"}, - {"ln_2.weight", "layer_norm2.weight"}, - {"mlp.c_fc.bias", "mlp.fc1.bias"}, - {"mlp.c_fc.weight", "mlp.fc1.weight"}, - {"mlp.c_proj.bias", "mlp.fc2.bias"}, - {"mlp.c_proj.weight", "mlp.fc2.weight"}, + {"attn.out_proj.bias", "self_attn.out_proj.bias"}, + {"attn.out_proj.weight", "self_attn.out_proj.weight"}, + {"ln_1.bias", "layer_norm1.bias"}, + {"ln_1.weight", "layer_norm1.weight"}, + {"ln_2.bias", "layer_norm2.bias"}, + {"ln_2.weight", "layer_norm2.weight"}, + {"mlp.c_fc.bias", "mlp.fc1.bias"}, + {"mlp.c_fc.weight", "mlp.fc1.weight"}, + {"mlp.c_proj.bias", "mlp.fc2.bias"}, + {"mlp.c_proj.weight", "mlp.fc2.weight"}, }; std::unordered_map vae_decoder_name_map = { - {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"}, }; -std::string 
convert_open_clip_to_hf_clip(const std::string& name) { +std::string convert_open_clip_to_hf_clip(const std::string &name) { std::string new_name = name; std::string prefix; if (starts_with(new_name, "conditioner.embedders.0.")) { - prefix = "cond_stage_model."; + prefix = "cond_stage_model."; new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { - prefix = "cond_stage_model.1."; + prefix = "cond_stage_model.1."; new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "cond_stage_model.")) { - prefix = "cond_stage_model."; + prefix = "cond_stage_model."; new_name = new_name.substr(strlen("cond_stage_model.")); } else { return new_name; } std::string open_clip_resblock_prefix = "model.transformer.resblocks."; - std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; + std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) { new_name = open_clip_to_hf_clip_model[new_name]; @@ -159,21 +159,21 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { if (new_name.find(open_clip_resblock_prefix) == 0) { std::string remain = new_name.substr(open_clip_resblock_prefix.length()); - std::string idx = remain.substr(0, remain.find(".")); + std::string idx = remain.substr(0, remain.find(".")); std::string suffix = remain.substr(idx.length() + 1); if (suffix == "attn.in_proj_weight" || suffix == "attn.in_proj_bias") { new_name = hf_clip_resblock_prefix + idx + "." + suffix; } else if (open_clip_to_hk_clip_resblock.find(suffix) != open_clip_to_hk_clip_resblock.end()) { std::string new_suffix = open_clip_to_hk_clip_resblock[suffix]; - new_name = hf_clip_resblock_prefix + idx + "." + new_suffix; + new_name = hf_clip_resblock_prefix + idx + "." 
+ new_suffix; } } return prefix + new_name; } -std::string convert_vae_decoder_name(const std::string& name) { +std::string convert_vae_decoder_name(const std::string &name) { if (vae_decoder_name_map.find(name) != vae_decoder_name_map.end()) { return vae_decoder_name_map[name]; } @@ -181,57 +181,57 @@ std::string convert_vae_decoder_name(const std::string& name) { } std::unordered_map> suffix_conversion_underline = { - { - "attentions", { - {"to_k", "k"}, - {"to_q", "q"}, - {"to_v", "v"}, - {"to_out_0", "proj_out"}, - {"group_norm", "norm"}, + "attentions", + { + {"to_k", "k"}, + {"to_q", "q"}, + {"to_v", "v"}, + {"to_out_0", "proj_out"}, + {"group_norm", "norm"}, + }, }, - }, - { - "resnets", { - {"conv1", "in_layers_2"}, - {"conv2", "out_layers_3"}, - {"norm1", "in_layers_0"}, - {"norm2", "out_layers_0"}, - {"time_emb_proj", "emb_layers_1"}, - {"conv_shortcut", "skip_connection"}, + "resnets", + { + {"conv1", "in_layers_2"}, + {"conv2", "out_layers_3"}, + {"norm1", "in_layers_0"}, + {"norm2", "out_layers_0"}, + {"time_emb_proj", "emb_layers_1"}, + {"conv_shortcut", "skip_connection"}, + }, }, - }, }; std::unordered_map> suffix_conversion_dot = { - { - "attentions", { - {"to_k", "k"}, - {"to_q", "q"}, - {"to_v", "v"}, - {"to_out.0", "proj_out"}, - {"group_norm", "norm"}, + "attentions", + { + {"to_k", "k"}, + {"to_q", "q"}, + {"to_v", "v"}, + {"to_out.0", "proj_out"}, + {"group_norm", "norm"}, + }, }, - }, - { - "resnets", { - {"conv1", "in_layers.2"}, - {"conv2", "out_layers.3"}, - {"norm1", "in_layers.0"}, - {"norm2", "out_layers.0"}, - {"time_emb_proj", "emb_layers.1"}, - {"conv_shortcut", "skip_connection"}, + "resnets", + { + {"conv1", "in_layers.2"}, + {"conv2", "out_layers.3"}, + {"norm1", "in_layers.0"}, + {"norm2", "out_layers.0"}, + {"time_emb_proj", "emb_layers.1"}, + {"conv_shortcut", "skip_connection"}, + }, }, - }, }; -std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) { +std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) { std::vector m; - auto match = [](std::vector& match_list, const std::regex& regex, const std::string& key) { + auto match = [](std::vector &match_list, const std::regex ®ex, const std::string &key) { auto r = std::smatch{}; if (!std::regex_match(key, r, regex)) { return false; @@ -251,7 +251,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) suffix_conversion = suffix_conversion_dot; } - auto get_converted_suffix = [&suffix_conversion](const std::string& outer_key, const std::string& inner_key) { + auto get_converted_suffix = [&suffix_conversion](const std::string &outer_key, const std::string &inner_key) { auto outer_iter = suffix_conversion.find(outer_key); if (outer_iter != suffix_conversion.end()) { auto inner_iter = outer_iter->second.find(inner_key); @@ -276,40 +276,50 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) } if (match(m, std::regex(format("unet%ctime_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) { - return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1]; + return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + + m[1]; } - if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex( + format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { 
std::string suffix = get_converted_suffix(m[1], m[3]); // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str()); - return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(1 + std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + + return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + + std::to_string(1 + std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + (m[1] == "attentions" ? "1" : "0") + seq + suffix; } if (match(m, std::regex(format("unet%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[0], m[2]); - return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) + + return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + + (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) + seq + suffix; } - if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex( + format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[1], m[3]); - return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + + return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + (m[1] == "attentions" ? "1" : "0") + seq + suffix; } - if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) { - return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op"; + if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), + key)) { + return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + + std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op"; } if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) { - return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(2 + std::stoi(m[0]) * 3) + seq + + return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + + std::to_string(2 + std::stoi(m[0]) * 3) + seq + (std::stoi(m[0]) > 0 ? 
"2" : "1") + seq + "conv"; } // clip if (match(m, std::regex(format("te%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { - return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + seq + m[1]; + return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + + seq + m[1]; } if (match(m, std::regex(format("te%ctext_model(.*)", seq)), key)) { @@ -321,21 +331,25 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, + std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), + key)) { std::string suffix; std::string block_name; if (m[1] == "attentions") { block_name = "attn"; - suffix = get_converted_suffix(m[1], m[3]); + suffix = get_converted_suffix(m[1], m[3]); } else { block_name = "block"; - suffix = m[3]; + suffix = m[3]; } return format("first_stage_model%c%s%cmid%c%s_%d%c%s", seq, m[0].c_str(), seq, seq, block_name.c_str(), std::stoi(m[2]) + 1, seq, suffix.c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) { + if (match(m, + std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), + key)) { std::string suffix = m[3]; if (suffix == "conv_shortcut") { suffix = "nin_shortcut"; @@ -344,12 +358,16 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) { + if (match(m, + std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), + key)) { return format("first_stage_model%c%s%cdown%c%d%cdownsample%cconv", seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq); } - if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) { + if (match(m, + std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), + key)) { std::string suffix = m[3]; if (suffix == "conv_shortcut") { suffix = "nin_shortcut"; @@ -358,7 +376,8 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), + key)) { return format("first_stage_model%c%s%cup%c%d%cupsample%cconv", seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq); } @@ -370,7 +389,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) return key; } -std::string convert_tensor_name(const std::string& name) { +std::string convert_tensor_name(const std::string &name) { std::string new_name; if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.")) { new_name = 
convert_open_clip_to_hf_clip(name); @@ -380,7 +399,7 @@ std::string convert_tensor_name(const std::string& name) { size_t pos = name.find('.'); if (pos != std::string::npos) { std::string name_without_network_parts = name.substr(5, pos - 5); - std::string network_part = name.substr(pos + 1); + std::string network_part = name.substr(pos + 1); // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str()); std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '_'); if (new_key.empty()) { @@ -395,7 +414,7 @@ std::string convert_tensor_name(const std::string& name) { size_t pos = name.find_last_of('.'); if (pos != std::string::npos) { std::string name_without_network_parts = name.substr(0, pos); - std::string network_part = name.substr(pos + 1); + std::string network_part = name.substr(pos + 1); // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str()); std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.'); if (new_key.empty()) { @@ -416,7 +435,7 @@ std::string convert_tensor_name(const std::string& name) { } void preprocess_tensor(TensorStorage tensor_storage, - std::vector& processed_tensor_storages) { + std::vector &processed_tensor_storages) { std::vector result; std::string new_name = convert_tensor_name(tensor_storage.name); @@ -439,9 +458,9 @@ void preprocess_tensor(TensorStorage tensor_storage, std::string prefix = new_name.substr(0, prefix_size); std::vector chunks = tensor_storage.chunk(3); - chunks[0].name = prefix + "self_attn.q_proj.weight"; - chunks[1].name = prefix + "self_attn.k_proj.weight"; - chunks[2].name = prefix + "self_attn.v_proj.weight"; + chunks[0].name = prefix + "self_attn.q_proj.weight"; + chunks[1].name = prefix + "self_attn.k_proj.weight"; + chunks[2].name = prefix + "self_attn.v_proj.weight"; processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end()); @@ -451,9 +470,9 @@ void preprocess_tensor(TensorStorage tensor_storage, std::string prefix = new_name.substr(0, prefix_size); std::vector chunks = tensor_storage.chunk(3); - chunks[0].name = prefix + "self_attn.q_proj.bias"; - chunks[1].name = prefix + "self_attn.k_proj.bias"; - chunks[2].name = prefix + "self_attn.v_proj.bias"; + chunks[0].name = prefix + "self_attn.q_proj.bias"; + chunks[1].name = prefix + "self_attn.k_proj.bias"; + chunks[2].name = prefix + "self_attn.v_proj.bias"; processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end()); } else { @@ -463,37 +482,38 @@ void preprocess_tensor(TensorStorage tensor_storage, float bf16_to_f32(uint16_t bfloat16) { uint32_t val_bits = (static_cast(bfloat16) << 16); - return *reinterpret_cast(&val_bits); + return *reinterpret_cast(&val_bits); } -void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) { +void bf16_to_f32_vec(uint16_t *src, float *dst, int64_t n) { // support inplace op for (int64_t i = n - 1; i >= 0; i--) { dst[i] = bf16_to_f32(src[i]); } } -void convert_tensor(void* src, ggml_type src_type, void* dst, ggml_type dst_type, int n) { +void convert_tensor(void *src, ggml_type src_type, void *dst, ggml_type dst_type, int n) { if (src_type == dst_type) { size_t nbytes = n * ggml_type_size(src_type) / ggml_blck_size(src_type); - memcpy(((char*)dst), ((char*)src), nbytes); + memcpy(((char *) dst), ((char *) src), nbytes); } else if (src_type == GGML_TYPE_F32) { if (dst_type == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n); + 
ggml_fp32_to_fp16_row((float *) src, (ggml_fp16_t *) dst, n); } else { int64_t hist[16]; - ggml_quantize_chunk(dst_type, (float*)src, dst, 0, n, hist); + ggml_quantize_chunk(dst_type, (float *) src, dst, 0, n, hist); } } else if (dst_type == GGML_TYPE_F32) { if (src_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t*)src, (float*)dst, n); + ggml_fp16_to_fp32_row((ggml_fp16_t *) src, (float *) dst, n); } else { auto qtype = ggml_internal_get_type_traits(src_type); if (qtype.to_float == NULL) { - throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", - ggml_type_name(src_type))); + throw std::runtime_error( + format("type %s unsupported for integer quantization: no dequantization available", + ggml_type_name(src_type))); } - qtype.to_float(src, (float*)dst, n); + qtype.to_float(src, (float *) dst, n); } } else { // src_type == GGML_TYPE_F16 => dst_type is quantized @@ -505,13 +525,13 @@ void convert_tensor(void* src, ggml_type src_type, void* dst, ggml_type dst_type } std::vector buf; buf.resize(sizeof(float) * n); - char* src_data_f32 = buf.data(); - qtype.to_float(src, (float*)src_data_f32, n); + char *src_data_f32 = buf.data(); + qtype.to_float(src, (float *) src_data_f32, n); if (dst_type == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n); + ggml_fp32_to_fp16_row((float *) src_data_f32, (ggml_fp16_t *) dst, n); } else { int64_t hist[16]; - ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, n, hist); + ggml_quantize_chunk(dst_type, (float *) src_data_f32, dst, 0, n, hist); } } } @@ -549,7 +569,7 @@ std::map unicode_to_byte() { // byte_decoder = {v: k for k, v in byte_encoder.items()} std::map byte_decoder; - for (const auto& entry : byte_to_unicode) { + for (const auto &entry: byte_to_unicode) { byte_decoder[entry.second] = entry.first; } @@ -558,8 +578,8 @@ std::map unicode_to_byte() { return byte_decoder; } -bool is_zip_file(const std::string& file_path) { - struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); +bool is_zip_file(const std::string &file_path) { + struct zip_t *zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { return false; } @@ -567,7 +587,7 @@ bool is_zip_file(const std::string& file_path) { return true; } -bool is_gguf_file(const std::string& file_path) { +bool is_gguf_file(const std::string &file_path) { std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { return false; @@ -588,7 +608,7 @@ bool is_gguf_file(const std::string& file_path) { return true; } -bool is_safetensors_file(const std::string& file_path) { +bool is_safetensors_file(const std::string &file_path) { std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { return false; @@ -605,7 +625,7 @@ bool is_safetensors_file(const std::string& file_path) { } uint8_t header_size_buf[ST_HEADER_SIZE_LEN]; - file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN); + file.read((char *) header_size_buf, ST_HEADER_SIZE_LEN); if (!file) { return false; } @@ -630,7 +650,7 @@ bool is_safetensors_file(const std::string& file_path) { return true; } -bool ModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_file(const std::string &file_path, const std::string &prefix) { if (is_directory(file_path)) { LOG_INFO("load %s using diffusers format", file_path.c_str()); return init_from_diffusers_file(file_path, prefix); @@ -651,14 +671,14 @@ bool ModelLoader::init_from_file(const std::string& file_path, const 
std::string /*================================================= GGUFModelLoader ==================================================*/ -bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_gguf_file(const std::string &file_path, const std::string &prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - gguf_context* ctx_gguf_ = NULL; - ggml_context* ctx_meta_ = NULL; - ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); + gguf_context *ctx_gguf_ = NULL; + ggml_context *ctx_meta_ = NULL; + ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); if (!ctx_gguf_) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; @@ -666,12 +686,12 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s int n_tensors = gguf_get_n_tensors(ctx_gguf_); - size_t total_size = 0; + size_t total_size = 0; size_t data_offset = gguf_get_data_offset(ctx_gguf_); for (int i = 0; i < n_tensors; i++) { - std::string name = gguf_get_tensor_name(ctx_gguf_, i); - struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str()); - size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); + std::string name = gguf_get_tensor_name(ctx_gguf_, i); + struct ggml_tensor *dummy = ggml_get_tensor(ctx_meta_, name.c_str()); + size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); // LOG_DEBUG("%s", name.c_str()); @@ -690,7 +710,7 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s /*================================================= SafeTensorsModelLoader ==================================================*/ -ggml_type str_to_ggml_type(const std::string& dtype) { +ggml_type str_to_ggml_type(const std::string &dtype) { ggml_type ttype = GGML_TYPE_COUNT; if (dtype == "F16") { ttype = GGML_TYPE_F16; @@ -703,7 +723,7 @@ ggml_type str_to_ggml_type(const std::string& dtype) { } // https://huggingface.co/docs/safetensors/index -bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const std::string &prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; @@ -725,7 +745,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const } uint8_t header_size_buf[ST_HEADER_SIZE_LEN]; - file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN); + file.read((char *) header_size_buf, ST_HEADER_SIZE_LEN); if (!file) { LOG_ERROR("read safetensors header size failed: '%s'", file_path.c_str()); return false; @@ -749,8 +769,8 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const nlohmann::json header_ = nlohmann::json::parse(header_buf.data()); - for (auto& item : header_.items()) { - std::string name = item.key(); + for (auto &item: header_.items()) { + std::string name = item.key(); nlohmann::json tensor_info = item.value(); // LOG_DEBUG("%s %s\n", name.c_str(), tensor_info.dump().c_str()); @@ -762,11 +782,11 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const continue; } - std::string dtype = tensor_info["dtype"]; + std::string dtype = tensor_info["dtype"]; nlohmann::json shape = tensor_info["shape"]; size_t begin = tensor_info["data_offsets"][0].get(); - size_t end = 
tensor_info["data_offsets"][1].get(); + size_t end = tensor_info["data_offsets"][1].get(); ggml_type type = str_to_ggml_type(dtype); if (type == GGML_TYPE_COUNT) { @@ -779,13 +799,14 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const return false; } - int n_dims = (int)shape.size(); + int n_dims = (int) shape.size(); int64_t ne[4] = {1, 1, 1, 1}; for (int i = 0; i < n_dims; i++) { ne[i] = shape[i].get(); } - TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin); + TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, + ST_HEADER_SIZE_LEN + header_size_ + begin); tensor_storage.reverse_ne(); @@ -806,9 +827,9 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const /*================================================= DiffusersModelLoader ==================================================*/ -bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_diffusers_file(const std::string &file_path, const std::string &prefix) { std::string unet_path = path_join(file_path, "unet/diffusion_pytorch_model.safetensors"); - std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); + std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); std::string clip_path = path_join(file_path, "text_encoder/model.safetensors"); if (!init_from_safetensors_file(unet_path, "unet.")) { @@ -923,7 +944,7 @@ struct PickleTensorReader { CHECK_SIZE, READ_DIMENS }; - ReadPhase phase = READ_NAME; + ReadPhase phase = READ_NAME; size_t entry_size = 0; int32_t nelements = 0; @@ -936,14 +957,14 @@ struct PickleTensorReader { if (phase == CHECK_SIZE) { if (entry_size == value * ggml_type_size(tensor_storage.type)) { nelements = value; - phase = READ_DIMENS; + phase = READ_DIMENS; return true; } else { phase = READ_NAME; } } else if (phase == READ_DIMENS) { if (tensor_storage.n_dims + 1 > 4) { // too many dimens - phase = READ_NAME; + phase = READ_NAME; tensor_storage.n_dims = 0; } if (nelements % value == 0) { @@ -954,23 +975,23 @@ struct PickleTensorReader { return false; } - void read_global(const std::string& str) { + void read_global(const std::string &str) { if (str == "FloatStorage") { if (read_global_type) { - global_type = GGML_TYPE_F32; + global_type = GGML_TYPE_F32; read_global_type = false; } tensor_storage.type = GGML_TYPE_F32; } else if (str == "HalfStorage") { if (read_global_type) { - global_type = GGML_TYPE_F16; + global_type = GGML_TYPE_F16; read_global_type = false; } tensor_storage.type = GGML_TYPE_F16; } } - void read_string(const std::string& str, struct zip_t* zip, std::string dir) { + void read_string(const std::string &str, struct zip_t *zip, std::string dir) { if (str == "storage") { read_global_type = true; } else if (str != "state_dict") { @@ -983,8 +1004,8 @@ struct PickleTensorReader { { std::string name = zip_entry_name(zip); if (name == entry_name) { - tensor_storage.index_in_zip = (int)i; - entry_size = zip_entry_size(zip); + tensor_storage.index_in_zip = (int) i; + entry_size = zip_entry_size(zip); zip_entry_close(zip); break; } @@ -996,7 +1017,7 @@ struct PickleTensorReader { } if (!read_global_type && phase == READ_NAME) { tensor_storage.name = str; - phase = READ_DATA; + phase = READ_DATA; tensor_storage.type = global_type; } } @@ -1006,7 +1027,7 @@ struct PickleTensorReader { ggml_type PickleTensorReader::global_type = 
GGML_TYPE_F32; // all pickle_tensors data type bool PickleTensorReader::read_global_type = false; -int find_char(uint8_t* buffer, int len, char c) { +int find_char(uint8_t *buffer, int len, char c) { for (int pos = 0; pos < len; pos++) { if (buffer[pos] == c) { return pos; @@ -1017,13 +1038,13 @@ int find_char(uint8_t* buffer, int len, char c) { #define MAX_STRING_BUFFER 512 -bool ModelLoader::parse_data_pkl(uint8_t* buffer, +bool ModelLoader::parse_data_pkl(uint8_t *buffer, size_t buffer_size, - zip_t* zip, + zip_t *zip, std::string dir, size_t file_index, - const std::string& prefix) { - uint8_t* buffer_end = buffer + buffer_size; + const std::string &prefix) { + uint8_t *buffer_end = buffer + buffer_size; if (buffer[0] == 0x80) { // proto if (buffer[1] != 2) { LOG_ERROR("Unsupported protocol\n"); @@ -1044,7 +1065,7 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, break; case ']': // EMPTY_LIST = b']' # push empty list break; - // skip unused sections + // skip unused sections case 'h': // BINGET = b'h' # " " " " " " ; " " 1-byte arg case 'q': // BINPUT = b'q' # " " " " " ; " " 1-byte arg case 'Q': // BINPERSID = b'Q' # " " " ; " " " " stack @@ -1067,7 +1088,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, buffer++; } buffer++; - } break; + } + break; case 'M': // BININT2 = b'M' # push 2-byte unsigned int { uint16_t value = read_short(buffer); @@ -1075,7 +1097,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, buffer++; } buffer += 2; - } break; + } + break; case 'J': // BININT = b'J' # push four-byte signed int { const int32_t value = read_int(buffer); @@ -1083,7 +1106,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, buffer++; // skip tuple after read num_elements } buffer += 4; - } break; + } + break; case 'X': // BINUNICODE = b'X' # " " " ; counted UTF-8 string argument { const int32_t len = read_int(buffer); @@ -1095,7 +1119,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, memcpy(string_buffer, buffer, len < MAX_STRING_BUFFER ? 
len : (MAX_STRING_BUFFER - 1)); buffer += len; reader.read_string(string_buffer, zip, dir); - } break; + } + break; case 0x8C: // SHORT_BINUNICODE = b'\x8c' # push short string; UTF-8 length < 256 bytes { const int8_t len = *buffer; @@ -1104,7 +1129,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, memcpy(string_buffer, buffer, len); buffer += len; // printf("String: '%s'\n", string_buffer); - } break; + } + break; case 'c': // GLOBAL = b'c' # push self.find_class(modname, name); 2 string args { int len = find_char(buffer, MAX_STRING_BUFFER, '\n'); @@ -1116,14 +1142,15 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, memcpy(string_buffer, buffer, len); buffer += len + 1; reader.read_global(string_buffer); - } break; + } + break; case 0x86: // TUPLE2 = b'\x86' # build 2-tuple from two topmost stack items case 0x85: // TUPLE1 = b'\x85' # build 1-tuple from stack top case 't': // TUPLE = b't' # build tuple from topmost stack items if (reader.phase == PickleTensorReader::READ_DIMENS) { reader.tensor_storage.reverse_ne(); reader.tensor_storage.file_index = file_index; - reader.tensor_storage.name = prefix + reader.tensor_storage.name; + reader.tensor_storage.name = prefix + reader.tensor_storage.name; tensor_storages.push_back(reader.tensor_storage); // LOG_DEBUG("%s", reader.tensor_storage.name.c_str()); // reset @@ -1141,31 +1168,31 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, return true; } -bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_ckpt_file(const std::string &file_path, const std::string &prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); + struct zip_t *zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; } - int n = (int)zip_entries_total(zip); + int n = (int) zip_entries_total(zip); for (int i = 0; i < n; ++i) { zip_entry_openbyindex(zip, i); { std::string name = zip_entry_name(zip); - size_t pos = name.find("data.pkl"); + size_t pos = name.find("data.pkl"); if (pos != std::string::npos) { std::string dir = name.substr(0, pos); - void* pkl_data = NULL; + void *pkl_data = NULL; size_t pkl_size; zip_entry_read(zip, &pkl_data, &pkl_size); // LOG_DEBUG("%lld", pkl_size); - parse_data_pkl((uint8_t*)pkl_data, pkl_size, zip, dir, file_index, prefix); + parse_data_pkl((uint8_t *) pkl_data, pkl_size, zip, dir, file_index, prefix); free(pkl_data); } @@ -1179,7 +1206,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s SDVersion ModelLoader::get_sd_version() { // return VERSION_1_x; TensorStorage token_embedding_weight; - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) { return VERSION_XL; } @@ -1205,7 +1232,7 @@ SDVersion ModelLoader::get_sd_version() { } ggml_type ModelLoader::get_sd_wtype() { - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } @@ -1219,16 +1246,16 @@ ggml_type ModelLoader::get_sd_wtype() { } std::string ModelLoader::load_merges() { - std::string merges_utf8_str(reinterpret_cast(merges_utf8_c_str), sizeof(merges_utf8_c_str)); + std::string merges_utf8_str(reinterpret_cast(merges_utf8_c_str), 
sizeof(merges_utf8_c_str)); return merges_utf8_str; } -void remove_duplicates(std::vector& vec) { +void remove_duplicates(std::vector &vec) { std::unordered_map name_to_index_map; for (size_t i = 0; i < vec.size(); ++i) { - const std::string& current_name = vec[i].name; - auto it = name_to_index_map.find(current_name); + const std::string ¤t_name = vec[i].name; + auto it = name_to_index_map.find(current_name); if (it != name_to_index_map.end()) { vec[it->second] = vec[i]; @@ -1242,7 +1269,7 @@ void remove_duplicates(std::vector& vec) { bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) { std::vector processed_tensor_storages; - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { // LOG_DEBUG("%s", name.c_str()); if (is_unused_tensor(tensor_storage.name)) { @@ -1264,7 +1291,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } bool is_zip = false; - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { if (tensor_storage.file_index != file_index) { continue; } @@ -1274,7 +1301,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } } - struct zip_t* zip = NULL; + struct zip_t *zip = NULL; if (is_zip) { zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { @@ -1286,16 +1313,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend std::vector read_buffer; std::vector convert_buffer; - auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) { + auto read_data = [&](const TensorStorage &tensor_storage, char *buf, size_t n) { if (zip != NULL) { zip_entry_openbyindex(zip, tensor_storage.index_in_zip); size_t entry_size = zip_entry_size(zip); if (entry_size != n) { read_buffer.resize(entry_size); - zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); - memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); + zip_entry_noallocread(zip, (void *) read_buffer.data(), entry_size); + memcpy((void *) buf, (void *) (read_buffer.data() + tensor_storage.offset), n); } else { - zip_entry_noallocread(zip, (void*)buf, n); + zip_entry_noallocread(zip, (void *) buf, n); } zip_entry_close(zip); } else { @@ -1309,13 +1336,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend return true; }; - for (auto& tensor_storage : processed_tensor_storages) { + for (auto &tensor_storage: processed_tensor_storages) { if (tensor_storage.file_index != file_index) { continue; } // LOG_DEBUG("%s", tensor_storage.name.c_str()); - ggml_tensor* dst_tensor = NULL; + ggml_tensor *dst_tensor = NULL; success = on_new_tensor_cb(tensor_storage, &dst_tensor); if (!success) { @@ -1333,35 +1360,38 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend #ifdef SD_USE_METAL || ggml_backend_is_metal(backend) #endif - ) { + ) { // for the CPU and Metal backend, we can copy directly into the tensor if (tensor_storage.type == dst_tensor->type) { GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); - read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read); + read_data(tensor_storage, (char *) dst_tensor->data, nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); + bf16_to_f32_vec((uint16_t *) dst_tensor->data, (float *) dst_tensor->data, + tensor_storage.nelements()); } } else 
{ read_buffer.resize(tensor_storage.nbytes()); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); + read_data(tensor_storage, (char *) read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + bf16_to_f32_vec((uint16_t *) read_buffer.data(), (float *) read_buffer.data(), + tensor_storage.nelements()); } - convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int)tensor_storage.nelements()); + convert_tensor((void *) read_buffer.data(), tensor_storage.type, dst_tensor->data, + dst_tensor->type, (int) tensor_storage.nelements()); } } else { read_buffer.resize(tensor_storage.nbytes()); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); + read_data(tensor_storage, (char *) read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + bf16_to_f32_vec((uint16_t *) read_buffer.data(), (float *) read_buffer.data(), + tensor_storage.nelements()); } if (tensor_storage.type == dst_tensor->type) { @@ -1370,9 +1400,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } else { // convert first, then copy to device memory convert_buffer.resize(ggml_nbytes(dst_tensor)); - convert_tensor((void*)read_buffer.data(), tensor_storage.type, - (void*)convert_buffer.data(), dst_tensor->type, - (int)tensor_storage.nelements()); + convert_tensor((void *) read_buffer.data(), tensor_storage.type, + (void *) convert_buffer.data(), dst_tensor->type, + (int) tensor_storage.nelements()); ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); } } @@ -1389,35 +1419,41 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend return success; } -bool ModelLoader::load_tensors(std::map& tensors, +bool ModelLoader::load_tensors(std::map &tensors, ggml_backend_t backend, - std::set ignore_tensors) { + std::set ignore_tensors, + bool standalone) { std::set tensor_names_in_file; - auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { - const std::string& name = tensor_storage.name; + auto on_new_tensor_cb = [&](const TensorStorage &tensor_storage, ggml_tensor **dst_tensor) -> bool { + const std::string &name = tensor_storage.name; tensor_names_in_file.insert(name); - struct ggml_tensor* real; + struct ggml_tensor *real; if (tensors.find(name) != tensors.end()) { real = tensors[name]; } else { if (ignore_tensors.find(name) == ignore_tensors.end()) { - LOG_WARN("unknown tensor '%s' in model file", name.c_str()); + if (standalone) { + LOG_WARN("unknown tensor '%s' in model file", name.c_str()); + } else { + LOG_DEBUG("unknown tensor '%s' in model file", name.c_str()); + } } return true; } if ( - real->ne[0] != tensor_storage.ne[0] || - real->ne[1] != tensor_storage.ne[1] || - real->ne[2] != tensor_storage.ne[2] || - real->ne[3] != tensor_storage.ne[3]) { + real->ne[0] != tensor_storage.ne[0] || + real->ne[1] != tensor_storage.ne[1] || + real->ne[2] != tensor_storage.ne[2] || + real->ne[3] != tensor_storage.ne[3]) { LOG_ERROR( - "tensor '%s' has wrong shape in model file: " - "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", - name.c_str(), - (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3], - 
(int)real->ne[0], (int)real->ne[1], (int)real->ne[2], (int)real->ne[3]); + "tensor '%s' has wrong shape in model file: " + "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", + name.c_str(), + (int) tensor_storage.ne[0], (int) tensor_storage.ne[1], (int) tensor_storage.ne[2], + (int) tensor_storage.ne[3], + (int) real->ne[0], (int) real->ne[1], (int) real->ne[2], (int) real->ne[3]); return false; } @@ -1434,7 +1470,7 @@ bool ModelLoader::load_tensors(std::map& tenso bool some_tensor_not_init = false; - for (auto pair : tensors) { + for (auto pair: tensors) { if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) { continue; } @@ -1458,18 +1494,18 @@ bool ModelLoader::load_tensors(std::map& tenso return true; } -bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) { - auto backend = ggml_backend_cpu_init(); +bool ModelLoader::save_to_gguf_file(const std::string &file_path, ggml_type type) { + auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); mem_size += cal_mem_size(backend, type); LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f); - ggml_context* ggml_ctx = ggml_init({mem_size, NULL, false}); + ggml_context *ggml_ctx = ggml_init({mem_size, NULL, false}); - gguf_context* gguf_ctx = gguf_init_empty(); + gguf_context *gguf_ctx = gguf_init_empty(); - auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { - const std::string& name = tensor_storage.name; + auto on_new_tensor_cb = [&](const TensorStorage &tensor_storage, ggml_tensor **dst_tensor) -> bool { + const std::string &name = tensor_storage.name; ggml_type tensor_type = tensor_storage.type; if (type != GGML_TYPE_COUNT) { @@ -1480,7 +1516,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type } } - ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); + ggml_tensor *tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); if (tensor == NULL) { LOG_ERROR("ggml_new_tensor failed"); return false; @@ -1519,14 +1555,14 @@ int64_t ModelLoader::cal_mem_size(ggml_backend_t backend, ggml_type type) { } int64_t mem_size = 0; std::vector processed_tensor_storages; - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } preprocess_tensor(tensor_storage, processed_tensor_storages); } - for (auto& tensor_storage : processed_tensor_storages) { + for (auto &tensor_storage: processed_tensor_storages) { ggml_type tensor_type = tensor_storage.type; if (type != GGML_TYPE_COUNT) { if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) { @@ -1542,7 +1578,7 @@ int64_t ModelLoader::cal_mem_size(ggml_backend_t backend, ggml_type type) { return mem_size; } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { +bool convert(const char *input_path, const char *vae_path, const char *output_path, sd_type_t output_type) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -1556,6 +1592,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); + bool success = model_loader.save_to_gguf_file(output_path, 
(ggml_type) output_type); return success; } \ No newline at end of file diff --git a/model.h b/model.h index 4b692a30..b0d61547 100644 --- a/model.h +++ b/model.h @@ -120,7 +120,7 @@ class ModelLoader { bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend); bool load_tensors(std::map& tensors, ggml_backend_t backend, - std::set ignore_tensors = {}); + std::set ignore_tensors = {}, bool standalone=true); bool save_to_gguf_file(const std::string& file_path, ggml_type type); int64_t cal_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 10e24585..3954e326 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -14,59 +14,60 @@ #include "unet.hpp" #include "vae.hpp" -const char* model_version_to_str[] = { - "1.x", - "2.x", - "XL", +const char *model_version_to_str[] = { + "1.x", + "2.x", + "XL", }; -const char* sampling_methods_str[] = { - "Euler A", - "Euler", - "Heun", - "DPM2", - "DPM++ (2s)", - "DPM++ (2M)", - "modified DPM++ (2M)", - "LCM", +const char *sampling_methods_str[] = { + "Euler A", + "Euler", + "Heun", + "DPM2", + "DPM++ (2s)", + "DPM++ (2M)", + "modified DPM++ (2M)", + "LCM", }; /*================================================== Helper Functions ================================================*/ -void calculate_alphas_cumprod(float* alphas_cumprod, +void calculate_alphas_cumprod(float *alphas_cumprod, float linear_start = 0.00085f, - float linear_end = 0.0120, - int timesteps = TIMESTEPS) { + float linear_end = 0.0120, + int timesteps = TIMESTEPS) { float ls_sqrt = sqrtf(linear_start); float le_sqrt = sqrtf(linear_end); - float amount = le_sqrt - ls_sqrt; + float amount = le_sqrt - ls_sqrt; float product = 1.0f; for (int i = 0; i < timesteps; i++) { - float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); + float beta = ls_sqrt + amount * ((float) i / (timesteps - 1)); product *= 1.0f - powf(beta, 2.0f); alphas_cumprod[i] = product; } } + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { public: SDVersion version; - bool vae_decode_only = false; + bool vae_decode_only = false; bool free_params_immediately = false; std::shared_ptr rng = std::make_shared(); - int n_threads = -1; - float scale_factor = 0.18215f; + int n_threads = -1; + float scale_factor = 0.18215f; FrozenCLIPEmbedderWithCustomWords cond_stage_model; UNetModel diffusion_model; AutoEncoderKL first_stage_model; bool use_tiny_autoencoder = false; - bool vae_tiling = false; + bool vae_tiling = false; - std::map tensors; + std::map tensors; std::string lora_model_dir; // lora_name => multiplier @@ -74,45 +75,52 @@ class StableDiffusionGGML { std::map loras; std::shared_ptr denoiser = std::make_shared(); - ggml_backend_t backend = NULL; // general backend - ggml_type model_data_type = GGML_TYPE_COUNT; + schedule_t schedule = DEFAULT; + + ggml_backend_t backend = NULL; // general backend + ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type + ggml_type wtype = GGML_TYPE_COUNT; // options weight type TinyAutoEncoder tae_first_stage; std::string taesd_path; + ModelLoader model_loader; + StableDiffusionGGML() = default; StableDiffusionGGML(int n_threads, bool vae_decode_only, bool free_params_immediately, std::string lora_model_dir, - rng_type_t rng_type) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - 
free_params_immediately(free_params_immediately),
-          lora_model_dir(lora_model_dir) {
+                        rng_type_t rng_type,
+                        bool vae_tiling,
+                        ggml_type wtype,
+                        schedule_t schedule,
+                        bool init_backend_immediately = true)
+        : n_threads(n_threads),
+          vae_decode_only(vae_decode_only),
+          free_params_immediately(free_params_immediately),
+          lora_model_dir(lora_model_dir),
+          vae_tiling(vae_tiling),
+          wtype(wtype),
+          schedule(schedule) {
         first_stage_model.decode_only = vae_decode_only;
-        tae_first_stage.decode_only = vae_decode_only;
+        tae_first_stage.decode_only = vae_decode_only;
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
         }
+        if (init_backend_immediately) {
+            init_backend();
+        }
     }

     ~StableDiffusionGGML() {
         ggml_backend_free(backend);
     }

-    bool load_from_file(const std::string& model_path,
-                        const std::string& vae_path,
-                        const std::string& taesd_path,
-                        bool vae_tiling,
-                        ggml_type wtype,
-                        schedule_t schedule) {
-        this->use_tiny_autoencoder = taesd_path.size() > 0;
-        this->taesd_path = taesd_path;
-        this->vae_tiling = vae_tiling;
+    void init_backend() {
 #ifdef SD_USE_CUBLAS
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
@@ -134,18 +142,43 @@ class StableDiffusionGGML {
         LOG_INFO("Flash Attention enabled");
 #endif
 #endif
-        LOG_INFO("loading model from '%s'", model_path.c_str());
-        ModelLoader model_loader;
+    }
+
+    void set_options(int n_threads,
+                     bool vae_decode_only,
+                     bool free_params_immediately,
+                     std::string lora_model_dir,
+                     rng_type_t rng_type,
+                     bool vae_tiling,
+                     sd_type_t wtype,
+                     schedule_t schedule) {
+        this->n_threads = n_threads;
+        this->vae_decode_only = vae_decode_only;
+        this->free_params_immediately = free_params_immediately;
+        this->lora_model_dir = lora_model_dir;
+        if (rng_type == STD_DEFAULT_RNG) {
+            rng = std::make_shared<STDDefaultRNG>();
+        } else if (rng_type == CUDA_RNG) {
+            rng = std::make_shared<PhiloxRNG>();
+        }
+        this->vae_tiling = vae_tiling;
+        this->wtype = (ggml_type) wtype;
+        this->schedule = schedule;
+        apply_schedule();
+    }

-        if (!model_loader.init_from_file(model_path)) {
-            LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
+    bool load_clip_from_file(const std::string &model_path, bool standalone = true, const std::string &prefix = "te.") {
+        if (backend == NULL) {
+            LOG_ERROR("backend is not initialized; when init_backend_immediately is false, call init_backend first");
             return false;
         }

-        if (vae_path.size() > 0) {
-            LOG_INFO("loading vae from '%s'", vae_path.c_str());
-            if (!model_loader.init_from_file(vae_path, "vae.")) {
-                LOG_WARN("loading vae from '%s' failed", vae_path.c_str());
+        if (!model_path.empty()) {
+            LOG_INFO("loading clip from '%s'", model_path.c_str());
+            if (!model_loader.init_from_file(model_path, prefix)) {
+                LOG_WARN("loading clip from '%s' failed", model_path.c_str());
+                return false;
             }
         }

@@ -154,18 +187,30 @@ class StableDiffusionGGML {
             LOG_ERROR("get sd version from file failed: '%s'", model_path.c_str());
             return false;
         }
+
         if (version == VERSION_XL) {
             scale_factor = 0.13025f;
         }
+
         cond_stage_model = FrozenCLIPEmbedderWithCustomWords(version);
-        diffusion_model = UNetModel(version);

         LOG_INFO("Stable Diffusion %s ", model_version_to_str[version]);
+
+        auto autodiscover_wtype = model_loader.get_sd_wtype();
+
         if (wtype == GGML_TYPE_COUNT) {
-            model_data_type = model_loader.get_sd_wtype();
+            model_data_type = autodiscover_wtype;
         } else {
-            model_data_type = wtype;
+            if (wtype > autodiscover_wtype) {
+                LOG_WARN("Stable Diffusion weight type can't be set to %s, falling back to %s",
default: %s", + ggml_type_name(wtype), + ggml_type_name(model_data_type)); + model_data_type = autodiscover_wtype; + } else { + model_data_type = wtype; + } } + LOG_INFO("Stable Diffusion weight type: %s", ggml_type_name(model_data_type)); LOG_DEBUG("loading vocab"); @@ -177,99 +222,132 @@ class StableDiffusionGGML { cond_stage_model.tokenizer.load_from_merges(merges_utf8_str); - // create the ggml context for network params - LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); + if (!cond_stage_model.alloc_params_buffer(backend, model_data_type)) { + return false; + } - if ( - !cond_stage_model.alloc_params_buffer(backend, model_data_type) || - !diffusion_model.alloc_params_buffer(backend, model_data_type)) { + LOG_DEBUG("preparing memory for clip weights"); + // prepare memory for the weights + { + cond_stage_model.init_params(); + cond_stage_model.map_by_name(tensors, "cond_stage_model."); + } + + struct ggml_init_params params; + params.mem_size = static_cast(3 * 1024) * 1024; // 10M + params.mem_buffer = NULL; + params.no_alloc = false; + // LOG_DEBUG("mem_size %u ", params.mem_size); + struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + if (!ctx) { + LOG_ERROR("ggml_init() failed"); return false; } - ggml_type vae_type = model_data_type; - if (version == VERSION_XL) { - vae_type = GGML_TYPE_F32; // avoid nan, not work... + // load weights + LOG_DEBUG("loading clip weights"); + int64_t t0 = ggml_time_ms(); + + std::map tensors_need_to_load; + std::set ignore_tensors; + + for (auto &pair: tensors) { + tensors_need_to_load.insert(pair); + } + + bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors, standalone); + if (!success) { + LOG_ERROR("load tensors from clip model failed"); + ggml_free(ctx); + return false; + } + + LOG_INFO("clip memory buffer size = %.2fMB", cond_stage_model.params_buffer_size / 1024.0 / 1024.0); + int64_t t1 = ggml_time_ms(); + LOG_INFO("loading clip model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); + ggml_free(ctx); + return true; + } + + void free_clip_params() { + if (cond_stage_model.params_buffer_size > 0) { + cond_stage_model.free_params_buffer(); + } + } + + bool load_unet_from_file(const std::string &model_path, + bool standalone = true, + const std::string &prefix = "unet.") { + if (backend == NULL) { + LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); + return false; + } + + if (version == VERSION_COUNT) { + LOG_ERROR("get sd version from file failed: '%s' ,make sure clip model has loaded", model_path.c_str()); + return false; + } + + if (!model_path.empty() && standalone) { + LOG_INFO("loading unet from '%s'", model_path.c_str()); + if (!model_loader.init_from_file(model_path, prefix)) { + LOG_WARN("loading unet from '%s' failed", model_path.c_str()); + return false; + } } - if (!use_tiny_autoencoder && !first_stage_model.alloc_params_buffer(backend, vae_type)) { + diffusion_model = UNetModel(version); + if (!diffusion_model.alloc_params_buffer(backend, model_data_type)) { return false; } - LOG_DEBUG("preparing memory for the weights"); + LOG_DEBUG("preparing memory for unet weights"); // prepare memory for the weights { - // cond_stage_model(FrozenCLIPEmbedder) - cond_stage_model.init_params(); - cond_stage_model.map_by_name(tensors, "cond_stage_model."); - // diffusion_model(UNetModel) diffusion_model.init_params(); diffusion_model.map_by_name(tensors, 
"model.diffusion_model."); - - if (!use_tiny_autoencoder) { - // firest_stage_model(AutoEncoderKL) - first_stage_model.init_params(); - } - first_stage_model.map_by_name(tensors, "first_stage_model."); } struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_size = static_cast(3 * 1024) * 1024; // 10M params.mem_buffer = NULL; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + params.no_alloc = false; + + struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + if (!ctx) { LOG_ERROR("ggml_init() failed"); return false; } - ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); - calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); // load weights LOG_DEBUG("loading weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set ignore_tensors; + ggml_tensor *alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); + calculate_alphas_cumprod((float *) alphas_cumprod_tensor->data); tensors_need_to_load["alphas_cumprod"] = alphas_cumprod_tensor; - for (auto& pair : tensors) { - const std::string& name = pair.first; - - if (use_tiny_autoencoder && starts_with(name, "first_stage_model.")) { - ignore_tensors.insert(name); - continue; - } - - if (vae_decode_only && (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) { + for (auto &pair: tensors) { + const std::string &name = pair.first; + if (starts_with(name, "cond_stage_model.") || starts_with(name, "first_stage_model.")) { ignore_tensors.insert(name); continue; } - tensors_need_to_load.insert(pair); } - bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors); + bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors, standalone); if (!success) { - LOG_ERROR("load tensors from model loader failed"); + LOG_ERROR("load unet tensors from model loader failed"); ggml_free(ctx); return false; } - - // LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0); - - size_t total_params_size = - cond_stage_model.params_buffer_size + - diffusion_model.params_buffer_size + - first_stage_model.params_buffer_size; - LOG_INFO("total memory buffer size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB)", - total_params_size / 1024.0 / 1024.0, - cond_stage_model.params_buffer_size / 1024.0 / 1024.0, - diffusion_model.params_buffer_size / 1024.0 / 1024.0, - first_stage_model.params_buffer_size / 1024.0 / 1024.0); + LOG_INFO("unet memory buffer size = %.2fMB", diffusion_model.params_buffer_size / 1024.0 / 1024.0); int64_t t1 = ggml_time_ms(); - LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); + LOG_INFO("loading unet model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); - // check is_using_v_parameterization_for_sd2 bool is_using_v_parameterization = false; if (version == VERSION_2_x) { if (is_using_v_parameterization_for_sd2(ctx)) { @@ -284,64 +362,183 @@ class StableDiffusionGGML { LOG_INFO("running in eps-prediction mode"); } - if (schedule != DEFAULT) { - switch (schedule) { - case DISCRETE: - LOG_INFO("running with discrete schedule"); - denoiser->schedule = std::make_shared(); - break; - case KARRAS: - LOG_INFO("running 
with Karras schedule"); - denoiser->schedule = std::make_shared(); - break; - case DEFAULT: - // Don't touch anything. - break; - default: - LOG_ERROR("Unknown schedule %i", schedule); - abort(); + apply_schedule(); + ggml_free(ctx); + return true; + } + + + void free_unet_params() { + if (diffusion_model.params_buffer_size > 0) { + diffusion_model.free_params_buffer(); + } + } + + bool load_vae_from_file(const std::string &model_path, + bool standalone = true, + const std::string &prefix = "vae.") { + if (backend == NULL) { + LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); + return false; + } + + if (version == VERSION_COUNT) { + LOG_ERROR("get sd version from file failed: '%s' ,please call load_clip_from_file first", + model_path.c_str()); + return false; + } + + if (!model_path.empty() && standalone) { + LOG_INFO("loading vae from '%s'", model_path.c_str()); + if (!model_loader.init_from_file(model_path, prefix)) { + LOG_WARN("loading vae from '%s' failed", model_path.c_str()); + return false; } } - for (int i = 0; i < TIMESTEPS; i++) { - denoiser->schedule->alphas_cumprod[i] = ((float*)alphas_cumprod_tensor->data)[i]; - denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); - denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); + ggml_type vae_type = model_data_type; + if (version == VERSION_XL) { + vae_type = GGML_TYPE_F32; // avoid nan, not work... + } + + if (!first_stage_model.alloc_params_buffer(backend, vae_type)) { + return false; + } + + LOG_DEBUG("preparing memory for vae weights"); + // prepare memory for the weights + { + first_stage_model.init_params(); + first_stage_model.map_by_name(tensors, "first_stage_model."); + } + + struct ggml_init_params params; + params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_buffer = NULL; + params.no_alloc = false; + // LOG_DEBUG("mem_size %u ", params.mem_size); + struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + if (!ctx) { + LOG_ERROR("ggml_init() failed"); + return false; + } + + // load weights + LOG_DEBUG("loading weights"); + int64_t t0 = ggml_time_ms(); + + std::map tensors_need_to_load; + std::set ignore_tensors; + for (auto &pair: tensors) { + const std::string &name = pair.first; + // TODO: make it can reload in compute time. so we can set vae_decode_only dynamic. 
+            if (vae_decode_only &&
+                (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) {
+                ignore_tensors.insert(name);
+                continue;
+            }
+
+            tensors_need_to_load.insert(pair);
         }
-        LOG_DEBUG("finished loaded file");
+        bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors, standalone);
+        if (!success) {
+            LOG_ERROR("load tensors from vae model failed");
+            ggml_free(ctx);
+            return false;
+        }
+        LOG_INFO("vae memory buffer size = %.2fMB", first_stage_model.params_buffer_size / 1024.0 / 1024.0);
+        int64_t t1 = ggml_time_ms();
+        LOG_INFO("loading vae model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000);
         ggml_free(ctx);
-        if (use_tiny_autoencoder) {
-            return tae_first_stage.load_from_file(taesd_path, backend);
+        return true;
+    }
+
+    void free_vae_params() {
+        if (first_stage_model.params_buffer_size > 0) {
+            first_stage_model.free_params_buffer();
+        }
+    }
+
+    // load all model components from a single file
+    bool load_diffusions_from_file(const std::string &model_path) {
+        LOG_INFO("loading model from '%s'", model_path.c_str());
+        if (!load_clip_from_file(model_path, false, "")) {
+            free_clip_params();
+            return false;
         }
+
+        if (!load_unet_from_file(model_path, false, "")) {
+            free_clip_params();
+            free_unet_params();
+            return false;
+        }
+
+        if (!load_vae_from_file(model_path, false, "")) {
+            free_clip_params();
+            free_unet_params();
+            free_vae_params();
+            return false;
+        }
+        return true;
     }

-    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) {
-        struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
+    void free_diffusions_params() {
+        free_clip_params();
+        LOG_INFO("freed clip params");
+
+        free_unet_params();
+        LOG_INFO("freed unet params");
+
+        free_vae_params();
+        LOG_INFO("freed vae params");
+    }
+
+    bool load_taesd_from_file(const std::string &taesd_path) {
+        if (first_stage_model.params_buffer_size > 0) {
+            free_vae_params();
+        }
+        if (taesd_path.empty() || !tae_first_stage.load_from_file(taesd_path, backend)) {
+            return false;
+        }
+        use_tiny_autoencoder = true;
+        return true;
+    }
+
+    void free_taesd_params() {
+        if (tae_first_stage.params_buffer_size > 0) {
+            tae_first_stage.free_params_buffer();
+        }
+    }
+
+    bool is_using_v_parameterization_for_sd2(ggml_context *work_ctx) {
+        struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
         ggml_set_f32(x_t, 0.5);
-        struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
+        struct ggml_tensor *c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
         ggml_set_f32(c, 0.5);

-        struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1);  // [N, ]
-        struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+        struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32,
+                                                           1);  // [N, ]
+        struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps,
+                                                           diffusion_model.model_channels);  // [N, model_channels]

         int64_t t0 = ggml_time_ms();
         ggml_set_f32(timesteps, 999);
         set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels);
-        struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
+        struct ggml_tensor *out = ggml_dup_tensor(work_ctx, x_t);
         diffusion_model.alloc_compute_buffer(x_t, c, t_emb);
         diffusion_model.compute(out, n_threads, x_t, NULL, c, t_emb);
         diffusion_model.free_compute_buffer();

         double result = 0.f;
         {
-            float* vec_x = (float*)x_t->data;
-            float* vec_out = (float*)out->data;
+            float *vec_x = (float *) x_t->data;
+            float *vec_out = (float *) out->data;
             int64_t n = ggml_nelements(out);
             for (int i = 0; i < n; i++) {
-                result += ((double)vec_out[i] - (double)vec_x[i]);
+                result += ((double) vec_out[i] - (double) vec_x[i]);
             }
             result /= n;
         }
@@ -350,9 +547,39 @@ class StableDiffusionGGML {
         return result < -1;
     }

-    void apply_lora(const std::string& lora_name, float multiplier) {
-        int64_t t0 = ggml_time_ms();
-        std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
+    void apply_schedule() const {
+        float alphas_cumprod_tensor[TIMESTEPS];
+        calculate_alphas_cumprod(alphas_cumprod_tensor);
+        if (schedule != DEFAULT) {
+            switch (schedule) {
+                case DISCRETE:
+                    LOG_INFO("running with discrete schedule");
+                    denoiser->schedule = std::make_shared<DiscreteSchedule>();
+                    break;
+                case KARRAS:
+                    LOG_INFO("running with Karras schedule");
+                    denoiser->schedule = std::make_shared<KarrasSchedule>();
+                    break;
+                case DEFAULT:
+                    // Don't touch anything.
+                    break;
+                default:
+                    LOG_ERROR("Unknown schedule %i", schedule);
+                    abort();
+            }
+        }
+
+        for (int i = 0; i < TIMESTEPS; i++) {
+            denoiser->schedule->alphas_cumprod[i] = alphas_cumprod_tensor[i];
+            denoiser->schedule->sigmas[i] = std::sqrt(
+                (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]);
+            denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]);
+        }
+    }
+
+    void apply_lora(const std::string &lora_name, float multiplier) {
+        int64_t t0 = ggml_time_ms();
+        std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
         std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
         std::string file_path;
         if (file_exists(st_file_path)) {
@@ -360,7 +587,8 @@ class StableDiffusionGGML {
         } else if (file_exists(ckpt_file_path)) {
             file_path = ckpt_file_path;
         } else {
-            LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
+            LOG_WARN("cannot find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(),
+                     lora_name.c_str());
             return;
         }
         LoraModel lora(file_path);
@@ -381,14 +609,14 @@ class StableDiffusionGGML {
                  (t1 - t0) * 1.0f / 1000);
     }

-    void apply_loras(const std::unordered_map<std::string, float>& lora_state) {
+    void apply_loras(const std::unordered_map<std::string, float> &lora_state) {
         if (lora_state.size() > 0 && model_data_type != GGML_TYPE_F16 && model_data_type != GGML_TYPE_F32) {
             LOG_WARN("In quantized models when applying LoRA, the images have poor quality.");
         }
         std::unordered_map<std::string, float> lora_state_diff;
-        for (auto& kv : lora_state) {
-            const std::string& lora_name = kv.first;
-            float multiplier = kv.second;
+        for (auto &kv: lora_state) {
+            const std::string &lora_name = kv.first;
+            float multiplier = kv.second;

             if (curr_lora_state.find(lora_name) != curr_lora_state.end()) {
                 float curr_multiplier = curr_lora_state[lora_name];
@@ -401,35 +629,35 @@ class StableDiffusionGGML {
             }
         }

-        for (auto& kv : lora_state_diff) {
+        for (auto &kv: lora_state_diff) {
             apply_lora(kv.first, kv.second);
         }

         curr_lora_state = lora_state;
     }

-    std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition(ggml_context* work_ctx,
-                                                                const std::string& text,
-                                                                int clip_skip,
-                                                                int width,
-                                                                int height,
-                                                                bool force_zero_embeddings = false) {
+    std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition(ggml_context *work_ctx,
+                                                                const std::string &text,
+                                                                int clip_skip,
+                                                                int width,
+                                                                int height,
+                                                                bool force_zero_embeddings = false) {
         cond_stage_model.set_clip_skip(clip_skip);
-        auto tokens_and_weights = cond_stage_model.tokenize(text, true);
-        std::vector<int>&
tokens = tokens_and_weights.first; - std::vector& weights = tokens_and_weights.second; - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* pooled = NULL; - size_t total_hidden_size = cond_stage_model.text_model.hidden_size; + auto tokens_and_weights = cond_stage_model.tokenize(text, true); + std::vector &tokens = tokens_and_weights.first; + std::vector &weights = tokens_and_weights.second; + int64_t t0 = ggml_time_ms(); + struct ggml_tensor *pooled = NULL; + size_t total_hidden_size = cond_stage_model.text_model.hidden_size; if (version == VERSION_XL) { total_hidden_size += cond_stage_model.text_model2.hidden_size; pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, cond_stage_model.text_model2.projection_dim); } - struct ggml_tensor* hidden_states = ggml_new_tensor_2d(work_ctx, + struct ggml_tensor *hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, total_hidden_size, cond_stage_model.text_model.max_position_embeddings); // [N, n_token, hidden_size] - cond_stage_model.alloc_compute_buffer(work_ctx, (int)tokens.size()); + cond_stage_model.alloc_compute_buffer(work_ctx, (int) tokens.size()); cond_stage_model.compute(n_threads, tokens, hidden_states, pooled); cond_stage_model.free_compute_buffer(); // if (pooled != NULL) { @@ -439,7 +667,7 @@ class StableDiffusionGGML { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states); + ggml_tensor *result = ggml_dup_tensor(work_ctx, hidden_states); { float original_mean = ggml_tensor_mean(hidden_states); for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) { @@ -455,33 +683,34 @@ class StableDiffusionGGML { ggml_tensor_scale(result, (original_mean / new_mean)); } if (force_zero_embeddings) { - float* vec = (float*)result->data; + float *vec = (float *) result->data; for (int i = 0; i < ggml_nelements(result); i++) { vec[i] = 0; } } - ggml_tensor* vec = NULL; + ggml_tensor *vec = NULL; if (version == VERSION_XL) { int out_dim = 256; - vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); + vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); // [0:1280] size_t offset = 0; memcpy(vec->data, pooled->data, ggml_nbytes(pooled)); offset += ggml_nbytes(pooled); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); // original_size_as_tuple - float orig_width = (float)width; - float orig_height = (float)height; + float orig_width = (float) width; + float orig_height = (float) height; ggml_tensor_set_f32(timesteps, orig_height, 0); ggml_tensor_set_f32(timesteps, orig_width, 1); - ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); + ggml_tensor *embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, + offset); offset += ggml_nbytes(embed_view); set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // crop_coords_top_left - float crop_coord_top = 0.f; + float crop_coord_top = 0.f; float crop_coord_left = 0.f; ggml_tensor_set_f32(timesteps, crop_coord_top, 0); ggml_tensor_set_f32(timesteps, crop_coord_left, 1); @@ -490,8 +719,8 @@ class StableDiffusionGGML { set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // 
target_size_as_tuple - float target_width = (float)width; - float target_height = (float)height; + float target_width = (float) width; + float target_height = (float) height; ggml_tensor_set_f32(timesteps, target_height, 0); ggml_tensor_set_f32(timesteps, target_width, 1); embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); @@ -504,25 +733,27 @@ class StableDiffusionGGML { return {result, vec}; } - ggml_tensor* sample(ggml_context* work_ctx, - ggml_tensor* x_t, - ggml_tensor* noise, - ggml_tensor* c, - ggml_tensor* c_vector, - ggml_tensor* uc, - ggml_tensor* uc_vector, + ggml_tensor *sample(ggml_context *work_ctx, + ggml_tensor *x_t, + ggml_tensor *noise, + ggml_tensor *c, + ggml_tensor *c_vector, + ggml_tensor *uc, + ggml_tensor *uc_vector, float cfg_scale, sample_method_t method, - const std::vector& sigmas) { + const std::vector &sigmas) { size_t steps = sigmas.size() - 1; // x_t = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(x_t); - struct ggml_tensor* x = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *x = ggml_dup_tensor(work_ctx, x_t); copy_ggml_tensor(x, x_t); - struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); // [N, ] - struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor *noised_input = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + 1); // [N, ] + struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] diffusion_model.alloc_compute_buffer(noised_input, c, t_emb, c_vector); bool has_unconditioned = cfg_scale != 1.0 && uc != NULL; @@ -537,31 +768,31 @@ class StableDiffusionGGML { } // denoise wrapper - struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* out_uncond = NULL; + struct ggml_tensor *out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *out_uncond = NULL; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); } - struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *denoised = ggml_dup_tensor(work_ctx, x); - auto denoise = [&](ggml_tensor* input, float sigma, int step) { + auto denoise = [&](ggml_tensor *input, float sigma, int step) { if (step == 1) { - pretty_progress(0, (int)steps, 0); + pretty_progress(0, (int) steps, 0); } int64_t t0 = ggml_time_us(); - float c_skip = 1.0f; - float c_out = 1.0f; - float c_in = 1.0f; + float c_skip = 1.0f; + float c_out = 1.0f; + float c_in = 1.0f; std::vector scaling = denoiser->get_scalings(sigma); if (scaling.size() == 3) { // CompVisVDenoiser c_skip = scaling[0]; - c_out = scaling[1]; - c_in = scaling[2]; + c_out = scaling[1]; + c_in = scaling[2]; } else { // CompVisDenoiser c_out = scaling[0]; - c_in = scaling[1]; + c_in = scaling[1]; } float t = denoiser->schedule->sigma_to_t(sigma); @@ -575,16 +806,16 @@ class StableDiffusionGGML { // cond diffusion_model.compute(out_cond, n_threads, noised_input, NULL, c, t_emb, c_vector); - float* negative_data = NULL; + float *negative_data = NULL; if (has_unconditioned) { // uncond diffusion_model.compute(out_uncond, n_threads, noised_input, NULL, uc, t_emb, uc_vector); - negative_data = (float*)out_uncond->data; + negative_data = (float *) out_uncond->data; } - float* vec_denoised = 
(float*)denoised->data; - float* vec_input = (float*)input->data; - float* positive_data = (float*)out_cond->data; - int ne_elements = (int)ggml_nelements(denoised); + float *vec_denoised = (float *) denoised->data; + float *vec_input = (float *) input->data; + float *positive_data = (float *) out_cond->data; + int ne_elements = (int) ggml_nelements(denoised); for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -597,7 +828,7 @@ class StableDiffusionGGML { } int64_t t1 = ggml_time_us(); if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + pretty_progress(step, (int) steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } }; @@ -605,8 +836,8 @@ class StableDiffusionGGML { // sample_euler_ancestral switch (method) { case EULER_A: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -616,9 +847,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int i = 0; i < ggml_nelements(d); i++) { vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; @@ -626,16 +857,18 @@ class StableDiffusionGGML { } // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); // Euler method float dt = sigma_down - sigmas[i]; // x = x + d * dt { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_d[i] * dt; @@ -647,8 +880,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -656,10 +889,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case EULER: // Implemented without any sigma churn { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -669,9 +903,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma; @@ -681,18 +915,19 @@ 
class StableDiffusionGGML { float dt = sigmas[i + 1] - sigma; // x = x + d * dt { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } } - } break; + } + break; case HEUN: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -700,9 +935,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -713,25 +948,25 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } else { // Heun step - float* vec_d = (float*)d->data; - float* vec_d2 = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; + float *vec_d = (float *) d->data; + float *vec_d2 = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt; } denoise(x2, sigmas[i + 1], i + 1); - float* vec_denoised = (float*)denoised->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; vec_d[j] = (vec_d[j] + d2) / 2; @@ -739,10 +974,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case DPM2: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -750,9 +986,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -762,9 +998,9 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float dt = sigmas[i + 1] - sigmas[i]; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float dt = sigmas[i + 1] - sigmas[i]; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; @@ -772,18 +1008,18 @@ class StableDiffusionGGML { } else { // DPM-Solver-2 float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1]))); - float dt_1 = sigma_mid - sigmas[i]; - float dt_2 = sigmas[i + 1] - sigmas[i]; + float dt_1 = sigma_mid - sigmas[i]; + float dt_2 = sigmas[i + 1] - sigmas[i]; - float* vec_d = 
(float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt_1; } denoise(x2, sigma_mid, i + 1); - float* vec_denoised = (float*)denoised->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; vec_x[j] = vec_x[j] + d2 * dt_2; @@ -791,28 +1027,31 @@ class StableDiffusionGGML { } } - } break; + } + break; case DPMPP2S_A: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise denoise(x, sigmas[i], i + 1); // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - auto sigma_fn = [](float t) -> float { return exp(-t); }; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + auto sigma_fn = [](float t) -> float { return exp(-t); }; if (sigma_down == 0) { // Euler step - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -828,15 +1067,15 @@ class StableDiffusionGGML { } } else { // DPM-Solver++(2S) - float t = t_fn(sigmas[i]); + float t = t_fn(sigmas[i]); float t_next = t_fn(sigma_down); - float h = t_next - t; - float s = t + 0.5f * h; + float h = t_next - t; + float s = t + 0.5f * h; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; + float *vec_denoised = (float *) denoised->data; // First half-step for (int j = 0; j < ggml_nelements(x); j++) { @@ -855,8 +1094,8 @@ class StableDiffusionGGML { if (sigmas[i + 1] > 0) { ggml_tensor_set_f32_randn(noise, rng); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -864,10 +1103,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case DPMPP2M: // DPM++ (2M) from Karras et al (2022) { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -875,14 +1115,14 @@ class StableDiffusionGGML { // denoise 
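                    // The update below is DPM-Solver++(2M) in log-sigma space: with
                    // t = -log(sigma) and h = t_{i+1} - t_i, each step computes
                    //     x <- (sigma_{i+1} / sigma_i) * x - (exp(-h) - 1) * D
                    // where D extrapolates the current and previous denoised outputs
                    // using r = h_last / h:
                    //     D = (1 + 1/(2r)) * denoised_i - 1/(2r) * denoised_{i-1}
                    // matching the `a`, `b`, and `denoised_d` values computed below.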
denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float b = exp(-h) - 1.f; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float b = exp(-h) - 1.f; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; + float *vec_old_denoised = (float *) old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -891,10 +1131,11 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float r = h_last / h; + float r = h_last / h; for (int j = 0; j < ggml_nelements(x); j++) { - float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; - vec_x[j] = a * vec_x[j] - b * denoised_d; + float denoised_d = + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -903,10 +1144,11 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } break; + } + break; case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -914,13 +1156,13 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; + float *vec_old_denoised = (float *) old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -930,14 +1172,15 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float h_min = std::min(h_last, h); - float h_max = std::max(h_last, h); - float r = h_max / h_min; - float h_d = (h_max + h_min) / 2.f; - float b = exp(-h_d) - 1.f; + float h_min = std::min(h_last, h); + float h_max = std::max(h_last, h); + float r = h_max / h_min; + float h_d = (h_max + h_min) / 2.f; + float b = exp(-h_d) - 1.f; for (int j = 0; j < ggml_nelements(x); j++) { - float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; - vec_x[j] = a * vec_x[j] - b * denoised_d; + float denoised_d = + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -946,11 +1189,12 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } break; + } + break; case LCM: // Latent Consistency Models { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { 
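                    // Each LCM step takes the consistency-model shortcut: x is replaced by
                    // the denoised prediction directly, and if another step follows it is
                    // re-noised up to the next level: x <- denoised + sigma_{i+1} * z, z ~ N(0, I).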
float sigma = sigmas[i]; @@ -960,8 +1204,8 @@ class StableDiffusionGGML { // x = denoised { - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_denoised[j]; } @@ -972,8 +1216,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j]; @@ -981,7 +1225,8 @@ class StableDiffusionGGML { } } } - } break; + } + break; default: LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); @@ -992,27 +1237,28 @@ class StableDiffusionGGML { } // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { + ggml_tensor *get_first_stage_encoding(ggml_context *work_ctx, ggml_tensor *moments) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); + ggml_tensor *latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], + moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, latent); ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "noise.bin"); { - float mean = 0; + float mean = 0; float logvar = 0; - float value = 0; - float std_ = 0; + float value = 0; + float std_ = 0; for (int i = 0; i < latent->ne[3]; i++) { for (int j = 0; j < latent->ne[2]; j++) { for (int k = 0; k < latent->ne[1]; k++) { for (int l = 0; l < latent->ne[0]; l++) { - mean = ggml_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); + mean = ggml_tensor_get_f32(moments, l, k, j, i); + logvar = ggml_tensor_get_f32(moments, l, k, j + (int) latent->ne[2], i); logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); - value = value * scale_factor; + std_ = std::exp(0.5f * logvar); + value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); + value = value * scale_factor; // printf("%d %d %d %d -> %f\n", i, j, k, l, value); ggml_tensor_set_f32(latent, value, l, k, j, i); } @@ -1023,14 +1269,14 @@ class StableDiffusionGGML { return latent; } - ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - ggml_tensor* result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, + ggml_tensor *compute_first_stage(ggml_context *work_ctx, ggml_tensor *x, bool decode) { + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + ggml_tensor *result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, decode ? (W * 8) : (W / 8), // width decode ? (H * 8) : (H / 8), // height decode ? 3 : (use_tiny_autoencoder ? 
4 : 8));  // channels
-        int64_t t0 = ggml_time_ms();
+        int64_t t0 = ggml_time_ms();
         if (!use_tiny_autoencoder) {
             if (decode) {
                 ggml_tensor_scale(x, 1.0f / scale_factor);
@@ -1039,7 +1285,7 @@ class StableDiffusionGGML {
             }
             if (vae_tiling && decode) {  // TODO: support tiling vae encode
                 // split latent in 32x32 tiles and compute in several steps
-                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) {
                     if (init) {
                         first_stage_model.alloc_compute_buffer(in, decode);
                     } else {
@@ -1058,7 +1304,7 @@ class StableDiffusionGGML {
         } else {
             if (vae_tiling && decode) {  // TODO: support tiling vae encode
                 // split latent in 64x64 tiles and compute in several steps
-                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) {
                     if (init) {
                         tae_first_stage.alloc_compute_buffer(in, decode);
                     } else {
@@ -1073,18 +1319,19 @@ class StableDiffusionGGML {
             tae_first_stage.free_compute_buffer();
         }
         int64_t t1 = ggml_time_ms();
-        LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000);
+        LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE",
+                  (t1 - t0) * 1.0f / 1000);
         if (decode) {
             ggml_tensor_clamp(result, 0.0f, 1.0f);
         }
         return result;
     }

-    ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
+    ggml_tensor *encode_first_stage(ggml_context *work_ctx, ggml_tensor *x) {
         return compute_first_stage(work_ctx, x, false);
     }

-    ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
+    ggml_tensor *decode_first_stage(ggml_context *work_ctx, ggml_tensor *x) {
         return compute_first_stage(work_ctx, x, true);
     }
 };
@@ -1092,53 +1339,38 @@ class StableDiffusionGGML {

 /*================================================= SD API ==================================================*/

 struct sd_ctx_t {
-    StableDiffusionGGML* sd = NULL;
+    StableDiffusionGGML *sd = NULL;
 };

-sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
-                     const char* vae_path_c_str,
-                     const char* taesd_path_c_str,
-                     const char* lora_model_dir_c_str,
+sd_ctx_t *new_sd_ctx(int n_threads,
                      bool vae_decode_only,
-                     bool vae_tiling,
                      bool free_params_immediately,
-                     int n_threads,
-                     enum sd_type_t wtype,
+                     const char *lora_model_dir_c_str,
                      enum rng_type_t rng_type,
-                     enum schedule_t s) {
-    sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
+                     bool vae_tiling,
+                     enum sd_type_t wtype,
+                     enum schedule_t s,
+                     bool init_backend_immediately) {
+    sd_ctx_t *sd_ctx = (sd_ctx_t *) malloc(sizeof(sd_ctx_t));
     if (sd_ctx == NULL) {
         return NULL;
     }
-    std::string model_path(model_path_c_str);
-    std::string vae_path(vae_path_c_str);
-    std::string taesd_path(taesd_path_c_str);
     std::string lora_model_dir(lora_model_dir_c_str);

     sd_ctx->sd = new StableDiffusionGGML(n_threads,
                                          vae_decode_only,
                                          free_params_immediately,
                                          lora_model_dir,
-                                         rng_type);
-    if (sd_ctx->sd == NULL) {
-        return NULL;
-    }
-
-    if (!sd_ctx->sd->load_from_file(model_path,
-                                    vae_path,
-                                    taesd_path,
-                                    vae_tiling,
-                                    (ggml_type)wtype,
-                                    s)) {
-        delete sd_ctx->sd;
-        sd_ctx->sd = NULL;
-        free(sd_ctx);
-        return NULL;
-    }
+                                         rng_type,
+                                         vae_tiling,
+                                         static_cast<ggml_type>(wtype),
+                                         s,
+                                         init_backend_immediately);
     return sd_ctx;
 }

-void free_sd_ctx(sd_ctx_t* sd_ctx) {
+void free_sd_ctx(sd_ctx_t *sd_ctx) {
     if (sd_ctx->sd != NULL) {
         delete sd_ctx->sd;
         sd_ctx->sd = NULL;
@@ -1146,9 +1378,125 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
     free(sd_ctx);
 }

-sd_image_t* txt2img(sd_ctx_t* sd_ctx,
-                    const char* prompt_c_str,
-                    const char* negative_prompt_c_str,
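// A minimal sketch (editor's illustration, not part of this patch) of the split-loading
// flow the C API below enables; the file paths are placeholders:
//
//     sd_ctx_t* ctx = new_sd_ctx(/*n_threads=*/4, /*vae_decode_only=*/true,
//                                /*free_params_immediately=*/false, /*lora_model_dir=*/"",
//                                CUDA_RNG, /*vae_tiling=*/false, SD_TYPE_COUNT, DEFAULT,
//                                /*init_backend_immediately=*/true);
//     load_clip_from_file(ctx, "clip.safetensors", "te.");
//     load_unet_from_file(ctx, "unet.safetensors", "unet.");
//     load_vae_from_file(ctx, "vae.safetensors", "vae.");
//     // ... txt2img(ctx, ...); ...
//     free_diffusions_params(ctx);
//     free_sd_ctx(ctx);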
-sd_image_t* txt2img(sd_ctx_t* sd_ctx, - const char* prompt_c_str, - const char* negative_prompt_c_str, +void init_backend(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->init_backend(); +} + +void set_options(sd_ctx_t *sd_ctx, + int n_threads, + bool vae_decode_only, + bool free_params_immediately, + const char *lora_model_dir, + rng_type_t rng_type, + bool vae_tiling, + sd_type_t wtype, + schedule_t schedule +) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->set_options( + n_threads, + vae_decode_only, + free_params_immediately, + std::string(lora_model_dir), + rng_type, + vae_tiling, + wtype, + schedule + ); +} + +bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_clip_from_file(std::string(model_path), true,std::string(prefix)); +} + +void free_clip_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->free_clip_params(); +} + +bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_unet_from_file(std::string(model_path), true, std::string(prefix)); +} + +void free_unet_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->free_unet_params(); +} + +bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_vae_from_file(std::string(model_path), true, std::string(prefix)); +} + +void free_vae_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->free_vae_params(); +} + +bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_taesd_from_file(std::string(model_path)); +} + +void free_taesd_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->free_taesd_params(); +} + +// load all model from one file +bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_diffusions_from_file(std::string(model_path)); +} + +// free all model from one file +void free_diffusions_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + return sd_ctx->sd->free_diffusions_params(); +} + +sd_image_t *txt2img(sd_ctx_t *sd_ctx, + const char *prompt_c_str, + const char *negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1166,10 +1514,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, std::string negative_prompt(negative_prompt_c_str); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto 
result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto& kv : lora_f2m) { + for (auto &kv: lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } @@ -1185,10 +1533,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, params.mem_size += width * height * 3 * sizeof(float); params.mem_size *= batch_count; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context* work_ctx = ggml_init(params); + struct ggml_context *work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; @@ -1198,24 +1546,25 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library // by a third party with a seed <0, let's incorporate randomization here. - srand((int)time(NULL)); + srand((int) time(NULL)); seed = rand(); } - t0 = ggml_time_ms(); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor* c = cond_pair.first; - ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor* uc = NULL; - struct ggml_tensor* uc_vector = NULL; + t0 = ggml_time_ms(); + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor *c = cond_pair.first; + ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor *uc = NULL; + struct ggml_tensor *uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { force_zero_embeddings = true; } - auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, + force_zero_embeddings); + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); @@ -1224,23 +1573,24 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, sd_ctx->sd->cond_stage_model.free_params_buffer(); } - std::vector final_latents; // collect latents to decode + std::vector final_latents; // collect latents to decode int C = 4; int W = width / 8; int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); - int64_t cur_seed = seed + b; + int64_t cur_seed = seed + b; LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigmas); + struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, + sample_method, sigmas); // struct ggml_tensor* x_0 = 
load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
         int64_t sampling_end = ggml_time_ms();
@@ -1252,13 +1602,14 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
         sd_ctx->sd->diffusion_model.free_params_buffer();
     }
     int64_t t3 = ggml_time_ms();
-    LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000);
+    LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(),
+             (t3 - t1) * 1.0f / 1000);
 
     LOG_INFO("decoding %zu latents", final_latents.size());
-    std::vector<struct ggml_tensor*> decoded_images;  // collect decoded images
+    std::vector<struct ggml_tensor*> decoded_images; // collect decoded images
     for (size_t i = 0; i < final_latents.size(); i++) {
-        t1                      = ggml_time_ms();
-        struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
+        t1 = ggml_time_ms();
+        struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
         // print_ggml_tensor(img);
         if (img != NULL) {
             decoded_images.push_back(img);
@@ -1272,30 +1623,30 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
     if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) {
         sd_ctx->sd->first_stage_model.free_params_buffer();
     }
-    sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t));
+    sd_image_t *result_images = (sd_image_t *) calloc(batch_count, sizeof(sd_image_t));
     if (result_images == NULL) {
         ggml_free(work_ctx);
         return NULL;
     }
 
     for (size_t i = 0; i < decoded_images.size(); i++) {
-        result_images[i].width   = width;
-        result_images[i].height  = height;
+        result_images[i].width = width;
+        result_images[i].height = height;
         result_images[i].channel = 3;
-        result_images[i].data    = sd_tensor_to_image(decoded_images[i]);
+        result_images[i].data = sd_tensor_to_image(decoded_images[i]);
     }
     ggml_free(work_ctx);
 
     LOG_INFO(
-        "txt2img completed in %.2fs",
-        (t4 - t0) * 1.0f / 1000);
+            "txt2img completed in %.2fs",
+            (t4 - t0) * 1.0f / 1000);
 
     return result_images;
 }
 
-sd_image_t* img2img(sd_ctx_t* sd_ctx,
+sd_image_t *img2img(sd_ctx_t *sd_ctx,
                     sd_image_t init_image,
-                    const char* prompt_c_str,
-                    const char* negative_prompt_c_str,
+                    const char *prompt_c_str,
+                    const char *negative_prompt_c_str,
                     int clip_skip,
                     float cfg_scale,
                     int width,
@@ -1314,7 +1665,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     LOG_INFO("img2img %dx%d", width, height);
 
     std::vector<float> sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps);
-    size_t t_enc              = static_cast<size_t>(sample_steps * strength);
+    size_t t_enc = static_cast<size_t>(sample_steps * strength);
     LOG_INFO("target t_enc is %zu steps", t_enc);
     std::vector<float> sigma_sched;
     sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
 
@@ -1323,26 +1674,26 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10 MB
     params.mem_size += width * height * 3 * sizeof(float) * 2;
     params.mem_buffer = NULL;
-    params.no_alloc   = false;
+    params.no_alloc = false;
     // LOG_DEBUG("mem_size %u ", params.mem_size);
 
     // draft context
-    struct ggml_context* work_ctx = ggml_init(params);
+    struct ggml_context *work_ctx = ggml_init(params);
     if (!work_ctx) {
         LOG_ERROR("ggml_init() failed");
        return NULL;
     }
 
     if (seed < 0) {
-        seed = (int)time(NULL);
+        seed = (int) time(NULL);
     }
     sd_ctx->sd->rng->manual_seed(seed);
 
     // extract and remove lora
-    auto result_pair                                = extract_and_remove_lora(prompt);
+    auto result_pair = extract_and_remove_lora(prompt);
     std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
 
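+    // prompt-embedded LoRA directives are handled exactly as in txt2img above
-    for (auto& kv : 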
lora_f2m) { + for (auto &kv: lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } prompt = result_pair.second; @@ -1354,13 +1705,13 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, int64_t t1 = ggml_time_ms(); LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + ggml_tensor *init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(init_image.data, init_img); - t0 = ggml_time_ms(); - ggml_tensor* init_latent = NULL; + t0 = ggml_time_ms(); + ggml_tensor *init_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + ggml_tensor *moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); } else { init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } @@ -1368,19 +1719,20 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor* c = cond_pair.first; - ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor* uc = NULL; - struct ggml_tensor* uc_vector = NULL; + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor *c = cond_pair.first; + ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor *uc = NULL; + struct ggml_tensor *uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { force_zero_embeddings = true; } - auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, + force_zero_embeddings); + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } int64_t t2 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1); @@ -1389,11 +1741,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } sd_ctx->sd->rng->manual_seed(seed); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, init_latent); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, + struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigma_sched); // struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1403,7 +1755,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model.free_params_buffer(); } - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); + struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { 
sd_ctx->sd->first_stage_model.free_params_buffer();
     }
 
@@ -1412,17 +1764,17 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         return NULL;
     }
 
-    sd_image_t* result_images = (sd_image_t*)calloc(1, sizeof(sd_image_t));
+    sd_image_t *result_images = (sd_image_t *) calloc(1, sizeof(sd_image_t));
     if (result_images == NULL) {
         ggml_free(work_ctx);
         return NULL;
     }
 
     for (size_t i = 0; i < 1; i++) {
-        result_images[i].width   = width;
-        result_images[i].height  = height;
+        result_images[i].width = width;
+        result_images[i].height = height;
         result_images[i].channel = 3;
-        result_images[i].data    = sd_tensor_to_image(img);
+        result_images[i].data = sd_tensor_to_image(img);
     }
     ggml_free(work_ctx);
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index a18ee4a3..0d59dce4 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -54,8 +54,8 @@ enum schedule_t {
 
 // same as enum ggml_type
 enum sd_type_t {
-    SD_TYPE_F32  = 0,
-    SD_TYPE_F16  = 1,
+    SD_TYPE_F32 = 0,
+    SD_TYPE_F16 = 1,
     SD_TYPE_Q4_0 = 2,
     SD_TYPE_Q4_1 = 3,
     // SD_TYPE_Q4_2 = 4, support has been removed
@@ -78,7 +78,7 @@ enum sd_type_t {
     SD_TYPE_COUNT,
 };
 
-SD_API const char* sd_type_name(enum sd_type_t type);
+SD_API const char *sd_type_name(enum sd_type_t type);
 
 enum sd_log_level_t {
     SD_LOG_DEBUG,
@@ -87,38 +87,36 @@ enum sd_log_level_t {
     SD_LOG_ERROR
 };
 
-typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
+typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char *text, void *data);
 
-SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
+SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void *data);
 SD_API int32_t get_num_physical_cores();
-SD_API const char* sd_get_system_info();
+SD_API const char *sd_get_system_info();
 
 typedef struct {
     uint32_t width;
     uint32_t height;
     uint32_t channel;
-    uint8_t* data;
+    uint8_t *data;
 } sd_image_t;
 
 typedef struct sd_ctx_t sd_ctx_t;
 
-SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
-                            const char* vae_path,
-                            const char* taesd_path,
-                            const char* lora_model_dir,
+SD_API sd_ctx_t *new_sd_ctx(int n_threads,
                             bool vae_decode_only,
-                            bool vae_tiling,
                             bool free_params_immediately,
-                            int n_threads,
-                            enum sd_type_t wtype,
+                            const char *lora_model_dir_c_str,
                             enum rng_type_t rng_type,
-                            enum schedule_t s);
+                            bool vae_tiling,
+                            enum sd_type_t wtype,
+                            enum schedule_t s,
+                            bool init_backend_immediately = true);
 
-SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
+SD_API void free_sd_ctx(sd_ctx_t *sd_ctx);
 
-SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
-                           const char* prompt,
-                           const char* negative_prompt,
+SD_API sd_image_t *txt2img(sd_ctx_t *sd_ctx,
+                           const char *prompt,
+                           const char *negative_prompt,
                            int clip_skip,
                            float cfg_scale,
                            int width,
@@ -128,10 +126,10 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                            int64_t seed,
                            int batch_count);
 
-SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
+SD_API sd_image_t *img2img(sd_ctx_t *sd_ctx,
                            sd_image_t init_image,
-                           const char* prompt,
-                           const char* negative_prompt,
+                           const char *prompt,
+                           const char *negative_prompt,
                            int clip_skip,
                            float cfg_scale,
                            int width,
@@ -144,14 +142,46 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
 
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 
-SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
+SD_API upscaler_ctx_t *new_upscaler_ctx(const char *esrgan_path,
                                         int n_threads,
                                         enum sd_type_t wtype);
-SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
+SD_API void free_upscaler_ctx(upscaler_ctx_t *upscaler_ctx);
+
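+// The declarations below expose the staged loading workflow added by this
+// patch. A sketch of a custom loading sequence (C++ only: the default
+// arguments are invisible to plain C callers; file names are hypothetical):
+//
+//   sd_ctx_t *ctx = new_sd_ctx(n_threads, false, false, "", CUDA_RNG,
+//                              false, SD_TYPE_F16, DEFAULT,
+//                              false /* init_backend_immediately */);
+//   init_backend(ctx);
+//   load_clip_from_file(ctx, "clip.safetensors");  // default prefix "te."
+//   load_unet_from_file(ctx, "unet.safetensors");  // default prefix "unet."
+//   load_vae_from_file(ctx, "vae.safetensors");    // default prefix "vae."
+//   sd_image_t *image = txt2img(ctx, /* ... */);
+//   free_diffusions_params(ctx);
+//   free_sd_ctx(ctx);
+
+SD_API sd_image_t upscale(upscaler_ctx_t *upscaler_ctx, sd_image_t input_image, 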
uint32_t upscale_factor); + +SD_API void init_backend(sd_ctx_t *sd_ctx); + +SD_API void set_options(sd_ctx_t *sd_ctx, + int n_threads, + bool vae_decode_only, + bool free_params_immediately, + const char *lora_model_dir, + rng_type_t rng_type, + bool vae_tiling, + sd_type_t wtype, + schedule_t schedule); + +SD_API bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "te."); + +SD_API void free_clip_params(sd_ctx_t *sd_ctx); + +SD_API bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "unet."); + +SD_API void free_unet_params(sd_ctx_t *sd_ctx); + +SD_API bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "vae."); + +SD_API void free_vae_params(sd_ctx_t *sd_ctx); + +SD_API bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path); + +SD_API void free_taesd_params(sd_ctx_t *sd_ctx); + +SD_API bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path); -SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); +SD_API void free_diffusions_params(sd_ctx_t *sd_ctx); -SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type); +SD_API bool convert(const char *input_path, const char *vae_path, const char *output_path, sd_type_t output_type); #ifdef __cplusplus } From c7d11b9ba031f6dcd69ea49a28f77d7529e4336e Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Tue, 23 Jan 2024 19:08:49 +0800 Subject: [PATCH 2/8] format code --- clip.hpp | 11 +- esrgan.hpp | 2 +- examples/cli/main.cpp | 133 ++++---- model.cpp | 484 ++++++++++++++-------------- model.h | 3 +- stable-diffusion.cpp | 734 +++++++++++++++++++++--------------------- stable-diffusion.h | 78 ++--- unet.hpp | 18 +- 8 files changed, 720 insertions(+), 743 deletions(-) diff --git a/clip.hpp b/clip.hpp index a456fffc..742cce09 100644 --- a/clip.hpp +++ b/clip.hpp @@ -475,7 +475,6 @@ struct ResidualAttentionBlock { ln2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); ln2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - } void map_by_name(std::map& tensors, const std::string prefix) { @@ -822,11 +821,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule { auto hidden_states2 = text_model2.forward(ctx0, input_ids2); // [N, n_token, hidden_size2] hidden_states2 = ggml_reshape_4d(ctx0, - hidden_states2, - hidden_states2->ne[0], - hidden_states2->ne[1], - hidden_states2->ne[2], - hidden_states2->ne[3]); + hidden_states2, + hidden_states2->ne[0], + hidden_states2->ne[1], + hidden_states2->ne[2], + hidden_states2->ne[3]); hidden_states2 = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states2, 2, 0, 1, 3)); hidden_states = ggml_concat(ctx0, hidden_states, hidden_states2); // [N, n_token, hidden_size + hidden_size2] diff --git a/esrgan.hpp b/esrgan.hpp index 90194c0d..c86363f7 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -376,7 +376,7 @@ struct ESRGAN : public GGMLModule { struct ggml_cgraph* gf = ggml_new_graph(ctx0); struct ggml_tensor* x_ = NULL; - float out_scale = 0.2f; + float out_scale = 0.2f; // it's performing a compute, check if backend isn't cpu if (!ggml_backend_is_cpu(backend)) { diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index bde19f34..b08340b3 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -17,34 +17,34 @@ #include "stb_image_write.h" -const char *rng_type_to_str[] = { - "std_default", - "cuda", +const char* rng_type_to_str[] = { + 
"std_default", + "cuda", }; // Names of the sampler method, same order as enum sample_method in stable-diffusion.h -const char *sample_method_str[] = { - "euler_a", - "euler", - "heun", - "dpm2", - "dpm++2s_a", - "dpm++2m", - "dpm++2mv2", - "lcm", +const char* sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h -const char *schedule_str[] = { - "default", - "discrete", - "karras", +const char* schedule_str[] = { + "default", + "discrete", + "karras", }; -const char *modes_str[] = { - "txt2img", - "img2img", - "convert", +const char* modes_str[] = { + "txt2img", + "img2img", + "convert", }; enum SDMode { @@ -56,7 +56,7 @@ enum SDMode { struct SDParams { int n_threads = -1; - SDMode mode = TXT2IMG; + SDMode mode = TXT2IMG; std::string model_path; std::string vae_path; @@ -70,22 +70,22 @@ struct SDParams { std::string prompt; std::string negative_prompt; float cfg_scale = 7.0f; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; int batch_count = 1; sample_method_t sample_method = EULER_A; - schedule_t schedule = DEFAULT; - int sample_steps = 20; - float strength = 0.75f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool vae_tiling = false; + schedule_t schedule = DEFAULT; + int sample_steps = 20; + float strength = 0.75f; + rng_type_t rng_type = CUDA_RNG; + int64_t seed = 42; + bool verbose = false; + bool vae_tiling = false; }; -static std::string sd_basename(const std::string &path) { +static std::string sd_basename(const std::string& path) { size_t pos = path.find_last_of('/'); if (pos != std::string::npos) { return path.substr(pos + 1); @@ -124,7 +124,7 @@ void print_params(SDParams params) { printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false"); } -void print_usage(int argc, const char *argv[]) { +void print_usage(int argc, const char* argv[]) { printf("usage: %s [arguments]\n", argv[0]); printf("\n"); printf("arguments:\n"); @@ -161,7 +161,7 @@ void print_usage(int argc, const char *argv[]) { printf(" -v, --verbose print extra info\n"); } -void parse_args(int argc, const char **argv, SDParams ¶ms) { +void parse_args(int argc, const char** argv, SDParams& params) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -178,8 +178,8 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - const char *mode_selected = argv[i]; - int mode_found = -1; + const char* mode_selected = argv[i]; + int mode_found = -1; for (int d = 0; d < MODE_COUNT; d++) { if (!strcmp(mode_selected, modes_str[d])) { mode_found = d; @@ -190,7 +190,7 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { mode_selected); exit(1); } - params.mode = (SDMode) mode_found; + params.mode = (SDMode)mode_found; } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_arg = true; @@ -334,8 +334,8 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - const char *schedule_selected = argv[i]; - int schedule_found = -1; + const char* schedule_selected = argv[i]; + int schedule_found = -1; for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; @@ -345,7 +345,7 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - params.schedule = (schedule_t) schedule_found; + params.schedule = (schedule_t)schedule_found; } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_arg = true; @@ -357,8 +357,8 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - const char *sample_method_selected = argv[i]; - int sample_method_found = -1; + const char* sample_method_selected = argv[i]; + int sample_method_found = -1; for (int m = 0; m < N_SAMPLE_METHODS; m++) { if (!strcmp(sample_method_selected, sample_method_str[m])) { sample_method_found = m; @@ -368,7 +368,7 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - params.sample_method = (sample_method_t) sample_method_found; + params.sample_method = (sample_method_t)sample_method_found; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv); exit(0); @@ -434,7 +434,7 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { } if (params.seed < 0) { - srand((int) time(NULL)); + srand((int)time(NULL)); params.seed = rand(); } @@ -465,8 +465,8 @@ std::string get_image_params(SDParams params, int64_t seed) { return parameter_string; } -void sd_log_cb(enum sd_log_level_t level, const char *log, void *data) { - SDParams *params = (SDParams *) data; +void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { + SDParams* params = (SDParams*)data; if (!params->verbose && level <= SD_LOG_DEBUG) { return; } @@ -479,11 +479,11 @@ void sd_log_cb(enum sd_log_level_t level, const char *log, void *data) { } } -int main(int argc, const char *argv[]) { +int main(int argc, const char* argv[]) { SDParams params; parse_args(argc, argv, params); - sd_set_log_callback(sd_log_cb, (void *) ¶ms); + sd_set_log_callback(sd_log_cb, (void*)¶ms); if (params.verbose) { print_params(params); @@ -511,12 +511,12 @@ int main(int argc, const char *argv[]) { } } - bool vae_decode_only = true; - uint8_t 
*input_image_buffer = NULL; + bool vae_decode_only = true; + uint8_t* input_image_buffer = NULL; if (params.mode == IMG2IMG) { vae_decode_only = false; - int c = 0; + int c = 0; input_image_buffer = stbi_load(params.input_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); if (input_image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); @@ -539,17 +539,16 @@ int main(int argc, const char *argv[]) { } } - sd_ctx_t *sd_ctx = new_sd_ctx( - params.n_threads, - vae_decode_only, - true, - params.lora_model_dir.c_str(), - params.rng_type, - params.vae_tiling, - params.wtype, - params.schedule, - true - ); + sd_ctx_t* sd_ctx = new_sd_ctx( + params.n_threads, + vae_decode_only, + true, + params.lora_model_dir.c_str(), + params.rng_type, + params.vae_tiling, + params.wtype, + params.schedule, + true); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); @@ -577,7 +576,7 @@ int main(int argc, const char *argv[]) { } } - sd_image_t *results; + sd_image_t* results; if (params.mode == TXT2IMG) { results = txt2img(sd_ctx, params.prompt.c_str(), @@ -591,8 +590,8 @@ int main(int argc, const char *argv[]) { params.seed, params.batch_count); } else { - sd_image_t input_image = {(uint32_t) params.width, - (uint32_t) params.height, + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, 3, input_image_buffer}; @@ -619,7 +618,7 @@ int main(int argc, const char *argv[]) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0) { - upscaler_ctx_t *upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.n_threads, params.wtype); @@ -641,7 +640,7 @@ int main(int argc, const char *argv[]) { } } - size_t last = params.output_path.find_last_of("."); + size_t last = params.output_path.find_last_of("."); std::string dummy_name = last != std::string::npos ? 
params.output_path.substr(0, last) : params.output_path; for (int i = 0; i < params.batch_count; i++) { if (results[i].data == NULL) { diff --git a/model.cpp b/model.cpp index 60f4dda7..f4130e5c 100644 --- a/model.cpp +++ b/model.cpp @@ -23,7 +23,7 @@ #define ST_HEADER_SIZE_LEN 8 -uint64_t read_u64(uint8_t *buffer) { +uint64_t read_u64(uint8_t* buffer) { // little endian uint64_t value = 0; value |= static_cast(buffer[7]) << 56; @@ -37,7 +37,7 @@ uint64_t read_u64(uint8_t *buffer) { return value; } -int32_t read_int(uint8_t *buffer) { +int32_t read_int(uint8_t* buffer) { // little endian int value = 0; value |= buffer[3] << 24; @@ -47,7 +47,7 @@ int32_t read_int(uint8_t *buffer) { return value; } -uint16_t read_short(uint8_t *buffer) { +uint16_t read_short(uint8_t* buffer) { // little endian uint16_t value = 0; value |= buffer[1] << 8; @@ -58,44 +58,44 @@ uint16_t read_short(uint8_t *buffer) { /*================================================= Preprocess ==================================================*/ std::string self_attn_names[] = { - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", }; -const char *unused_tensors[] = { - "betas", - "alphas_cumprod_prev", - "sqrt_alphas_cumprod", - "sqrt_one_minus_alphas_cumprod", - "log_one_minus_alphas_cumprod", - "sqrt_recip_alphas_cumprod", - "sqrt_recipm1_alphas_cumprod", - "posterior_variance", - "posterior_log_variance_clipped", - "posterior_mean_coef1", - "posterior_mean_coef2", - "cond_stage_model.transformer.text_model.embeddings.position_ids", - "cond_stage_model.model.logit_scale", - "cond_stage_model.model.text_projection", - "conditioner.embedders.0.transformer.text_model.embeddings.position_ids", - "conditioner.embedders.0.model.logit_scale", - "conditioner.embedders.1.model.logit_scale", - "model.diffusion_model.time_embedding.cond_proj.weight", - "unet.time_embedding.cond_proj.weight", - "model_ema.decay", - "model_ema.num_updates", - "model_ema.diffusion_model", - "control_model", - "embedding_manager", - "denoiser.sigmas", +const char* unused_tensors[] = { + "betas", + "alphas_cumprod_prev", + "sqrt_alphas_cumprod", + "sqrt_one_minus_alphas_cumprod", + "log_one_minus_alphas_cumprod", + "sqrt_recip_alphas_cumprod", + "sqrt_recipm1_alphas_cumprod", + "posterior_variance", + "posterior_log_variance_clipped", + "posterior_mean_coef1", + "posterior_mean_coef2", + "cond_stage_model.transformer.text_model.embeddings.position_ids", + "cond_stage_model.model.logit_scale", + "cond_stage_model.model.text_projection", + "conditioner.embedders.0.transformer.text_model.embeddings.position_ids", + "conditioner.embedders.0.model.logit_scale", + "conditioner.embedders.1.model.logit_scale", + "model.diffusion_model.time_embedding.cond_proj.weight", + "unet.time_embedding.cond_proj.weight", + "model_ema.decay", + "model_ema.num_updates", + "model_ema.diffusion_model", + "control_model", + "embedding_manager", + "denoiser.sigmas", }; bool is_unused_tensor(std::string name) { - for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char *); i++) { + for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char*); i++) { if (starts_with(name, unused_tensors[i])) { return true; } @@ -104,54 +104,54 @@ bool is_unused_tensor(std::string name) { } std::unordered_map 
open_clip_to_hf_clip_model = { - {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"}, - {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"}, - {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"}, - {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"}, - {"model.text_projection", "transformer.text_model.text_projection"}, + {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"}, + {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"}, + {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"}, + {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"}, + {"model.text_projection", "transformer.text_model.text_projection"}, }; std::unordered_map open_clip_to_hk_clip_resblock = { - {"attn.out_proj.bias", "self_attn.out_proj.bias"}, - {"attn.out_proj.weight", "self_attn.out_proj.weight"}, - {"ln_1.bias", "layer_norm1.bias"}, - {"ln_1.weight", "layer_norm1.weight"}, - {"ln_2.bias", "layer_norm2.bias"}, - {"ln_2.weight", "layer_norm2.weight"}, - {"mlp.c_fc.bias", "mlp.fc1.bias"}, - {"mlp.c_fc.weight", "mlp.fc1.weight"}, - {"mlp.c_proj.bias", "mlp.fc2.bias"}, - {"mlp.c_proj.weight", "mlp.fc2.weight"}, + {"attn.out_proj.bias", "self_attn.out_proj.bias"}, + {"attn.out_proj.weight", "self_attn.out_proj.weight"}, + {"ln_1.bias", "layer_norm1.bias"}, + {"ln_1.weight", "layer_norm1.weight"}, + {"ln_2.bias", "layer_norm2.bias"}, + {"ln_2.weight", "layer_norm2.weight"}, + {"mlp.c_fc.bias", "mlp.fc1.bias"}, + {"mlp.c_fc.weight", "mlp.fc1.weight"}, + {"mlp.c_proj.bias", "mlp.fc2.bias"}, + {"mlp.c_proj.weight", "mlp.fc2.weight"}, }; std::unordered_map vae_decoder_name_map = { - {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"}, }; -std::string 
convert_open_clip_to_hf_clip(const std::string &name) { +std::string convert_open_clip_to_hf_clip(const std::string& name) { std::string new_name = name; std::string prefix; if (starts_with(new_name, "conditioner.embedders.0.")) { - prefix = "cond_stage_model."; + prefix = "cond_stage_model."; new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { - prefix = "cond_stage_model.1."; + prefix = "cond_stage_model.1."; new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "cond_stage_model.")) { - prefix = "cond_stage_model."; + prefix = "cond_stage_model."; new_name = new_name.substr(strlen("cond_stage_model.")); } else { return new_name; } std::string open_clip_resblock_prefix = "model.transformer.resblocks."; - std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; + std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) { new_name = open_clip_to_hf_clip_model[new_name]; @@ -159,21 +159,21 @@ std::string convert_open_clip_to_hf_clip(const std::string &name) { if (new_name.find(open_clip_resblock_prefix) == 0) { std::string remain = new_name.substr(open_clip_resblock_prefix.length()); - std::string idx = remain.substr(0, remain.find(".")); + std::string idx = remain.substr(0, remain.find(".")); std::string suffix = remain.substr(idx.length() + 1); if (suffix == "attn.in_proj_weight" || suffix == "attn.in_proj_bias") { new_name = hf_clip_resblock_prefix + idx + "." + suffix; } else if (open_clip_to_hk_clip_resblock.find(suffix) != open_clip_to_hk_clip_resblock.end()) { std::string new_suffix = open_clip_to_hk_clip_resblock[suffix]; - new_name = hf_clip_resblock_prefix + idx + "." + new_suffix; + new_name = hf_clip_resblock_prefix + idx + "." 
+ new_suffix; } } return prefix + new_name; } -std::string convert_vae_decoder_name(const std::string &name) { +std::string convert_vae_decoder_name(const std::string& name) { if (vae_decoder_name_map.find(name) != vae_decoder_name_map.end()) { return vae_decoder_name_map[name]; } @@ -181,57 +181,57 @@ std::string convert_vae_decoder_name(const std::string &name) { } std::unordered_map> suffix_conversion_underline = { + { + "attentions", { - "attentions", - { - {"to_k", "k"}, - {"to_q", "q"}, - {"to_v", "v"}, - {"to_out_0", "proj_out"}, - {"group_norm", "norm"}, - }, + {"to_k", "k"}, + {"to_q", "q"}, + {"to_v", "v"}, + {"to_out_0", "proj_out"}, + {"group_norm", "norm"}, }, + }, + { + "resnets", { - "resnets", - { - {"conv1", "in_layers_2"}, - {"conv2", "out_layers_3"}, - {"norm1", "in_layers_0"}, - {"norm2", "out_layers_0"}, - {"time_emb_proj", "emb_layers_1"}, - {"conv_shortcut", "skip_connection"}, - }, + {"conv1", "in_layers_2"}, + {"conv2", "out_layers_3"}, + {"norm1", "in_layers_0"}, + {"norm2", "out_layers_0"}, + {"time_emb_proj", "emb_layers_1"}, + {"conv_shortcut", "skip_connection"}, }, + }, }; std::unordered_map> suffix_conversion_dot = { + { + "attentions", { - "attentions", - { - {"to_k", "k"}, - {"to_q", "q"}, - {"to_v", "v"}, - {"to_out.0", "proj_out"}, - {"group_norm", "norm"}, - }, + {"to_k", "k"}, + {"to_q", "q"}, + {"to_v", "v"}, + {"to_out.0", "proj_out"}, + {"group_norm", "norm"}, }, + }, + { + "resnets", { - "resnets", - { - {"conv1", "in_layers.2"}, - {"conv2", "out_layers.3"}, - {"norm1", "in_layers.0"}, - {"norm2", "out_layers.0"}, - {"time_emb_proj", "emb_layers.1"}, - {"conv_shortcut", "skip_connection"}, - }, + {"conv1", "in_layers.2"}, + {"conv2", "out_layers.3"}, + {"norm1", "in_layers.0"}, + {"norm2", "out_layers.0"}, + {"time_emb_proj", "emb_layers.1"}, + {"conv_shortcut", "skip_connection"}, }, + }, }; -std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) { +std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) { std::vector m; - auto match = [](std::vector &match_list, const std::regex ®ex, const std::string &key) { + auto match = [](std::vector& match_list, const std::regex& regex, const std::string& key) { auto r = std::smatch{}; if (!std::regex_match(key, r, regex)) { return false; @@ -251,7 +251,7 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) suffix_conversion = suffix_conversion_dot; } - auto get_converted_suffix = [&suffix_conversion](const std::string &outer_key, const std::string &inner_key) { + auto get_converted_suffix = [&suffix_conversion](const std::string& outer_key, const std::string& inner_key) { auto outer_iter = suffix_conversion.find(outer_key); if (outer_iter != suffix_conversion.end()) { auto inner_iter = outer_iter->second.find(inner_key); @@ -280,8 +280,7 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) m[1]; } - if (match(m, std::regex( - format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[1], m[3]); // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str()); return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + @@ -296,8 +295,7 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) seq + suffix; } - if (match(m, 
std::regex( - format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[1], m[3]); return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + @@ -338,10 +336,10 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) std::string block_name; if (m[1] == "attentions") { block_name = "attn"; - suffix = get_converted_suffix(m[1], m[3]); + suffix = get_converted_suffix(m[1], m[3]); } else { block_name = "block"; - suffix = m[3]; + suffix = m[3]; } return format("first_stage_model%c%s%cmid%c%s_%d%c%s", seq, m[0].c_str(), seq, seq, block_name.c_str(), std::stoi(m[2]) + 1, seq, suffix.c_str()); @@ -389,7 +387,7 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) return key; } -std::string convert_tensor_name(const std::string &name) { +std::string convert_tensor_name(const std::string& name) { std::string new_name; if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.")) { new_name = convert_open_clip_to_hf_clip(name); @@ -399,7 +397,7 @@ std::string convert_tensor_name(const std::string &name) { size_t pos = name.find('.'); if (pos != std::string::npos) { std::string name_without_network_parts = name.substr(5, pos - 5); - std::string network_part = name.substr(pos + 1); + std::string network_part = name.substr(pos + 1); // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str()); std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '_'); if (new_key.empty()) { @@ -414,7 +412,7 @@ std::string convert_tensor_name(const std::string &name) { size_t pos = name.find_last_of('.'); if (pos != std::string::npos) { std::string name_without_network_parts = name.substr(0, pos); - std::string network_part = name.substr(pos + 1); + std::string network_part = name.substr(pos + 1); // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str()); std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.'); if (new_key.empty()) { @@ -435,7 +433,7 @@ std::string convert_tensor_name(const std::string &name) { } void preprocess_tensor(TensorStorage tensor_storage, - std::vector &processed_tensor_storages) { + std::vector& processed_tensor_storages) { std::vector result; std::string new_name = convert_tensor_name(tensor_storage.name); @@ -458,9 +456,9 @@ void preprocess_tensor(TensorStorage tensor_storage, std::string prefix = new_name.substr(0, prefix_size); std::vector chunks = tensor_storage.chunk(3); - chunks[0].name = prefix + "self_attn.q_proj.weight"; - chunks[1].name = prefix + "self_attn.k_proj.weight"; - chunks[2].name = prefix + "self_attn.v_proj.weight"; + chunks[0].name = prefix + "self_attn.q_proj.weight"; + chunks[1].name = prefix + "self_attn.k_proj.weight"; + chunks[2].name = prefix + "self_attn.v_proj.weight"; processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end()); @@ -470,9 +468,9 @@ void preprocess_tensor(TensorStorage tensor_storage, std::string prefix = new_name.substr(0, prefix_size); std::vector chunks = tensor_storage.chunk(3); - chunks[0].name = prefix + "self_attn.q_proj.bias"; - chunks[1].name = prefix + "self_attn.k_proj.bias"; - chunks[2].name = prefix + 
"self_attn.v_proj.bias"; + chunks[0].name = prefix + "self_attn.q_proj.bias"; + chunks[1].name = prefix + "self_attn.k_proj.bias"; + chunks[2].name = prefix + "self_attn.v_proj.bias"; processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end()); } else { @@ -482,38 +480,38 @@ void preprocess_tensor(TensorStorage tensor_storage, float bf16_to_f32(uint16_t bfloat16) { uint32_t val_bits = (static_cast(bfloat16) << 16); - return *reinterpret_cast(&val_bits); + return *reinterpret_cast(&val_bits); } -void bf16_to_f32_vec(uint16_t *src, float *dst, int64_t n) { +void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) { // support inplace op for (int64_t i = n - 1; i >= 0; i--) { dst[i] = bf16_to_f32(src[i]); } } -void convert_tensor(void *src, ggml_type src_type, void *dst, ggml_type dst_type, int n) { +void convert_tensor(void* src, ggml_type src_type, void* dst, ggml_type dst_type, int n) { if (src_type == dst_type) { size_t nbytes = n * ggml_type_size(src_type) / ggml_blck_size(src_type); - memcpy(((char *) dst), ((char *) src), nbytes); + memcpy(((char*)dst), ((char*)src), nbytes); } else if (src_type == GGML_TYPE_F32) { if (dst_type == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row((float *) src, (ggml_fp16_t *) dst, n); + ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n); } else { int64_t hist[16]; - ggml_quantize_chunk(dst_type, (float *) src, dst, 0, n, hist); + ggml_quantize_chunk(dst_type, (float*)src, dst, 0, n, hist); } } else if (dst_type == GGML_TYPE_F32) { if (src_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t *) src, (float *) dst, n); + ggml_fp16_to_fp32_row((ggml_fp16_t*)src, (float*)dst, n); } else { auto qtype = ggml_internal_get_type_traits(src_type); if (qtype.to_float == NULL) { throw std::runtime_error( - format("type %s unsupported for integer quantization: no dequantization available", - ggml_type_name(src_type))); + format("type %s unsupported for integer quantization: no dequantization available", + ggml_type_name(src_type))); } - qtype.to_float(src, (float *) dst, n); + qtype.to_float(src, (float*)dst, n); } } else { // src_type == GGML_TYPE_F16 => dst_type is quantized @@ -525,13 +523,13 @@ void convert_tensor(void *src, ggml_type src_type, void *dst, ggml_type dst_type } std::vector buf; buf.resize(sizeof(float) * n); - char *src_data_f32 = buf.data(); - qtype.to_float(src, (float *) src_data_f32, n); + char* src_data_f32 = buf.data(); + qtype.to_float(src, (float*)src_data_f32, n); if (dst_type == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row((float *) src_data_f32, (ggml_fp16_t *) dst, n); + ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n); } else { int64_t hist[16]; - ggml_quantize_chunk(dst_type, (float *) src_data_f32, dst, 0, n, hist); + ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, n, hist); } } } @@ -569,7 +567,7 @@ std::map unicode_to_byte() { // byte_decoder = {v: k for k, v in byte_encoder.items()} std::map byte_decoder; - for (const auto &entry: byte_to_unicode) { + for (const auto& entry : byte_to_unicode) { byte_decoder[entry.second] = entry.first; } @@ -578,8 +576,8 @@ std::map unicode_to_byte() { return byte_decoder; } -bool is_zip_file(const std::string &file_path) { - struct zip_t *zip = zip_open(file_path.c_str(), 0, 'r'); +bool is_zip_file(const std::string& file_path) { + struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { return false; } @@ -587,7 +585,7 @@ bool is_zip_file(const std::string &file_path) { return true; } -bool 
is_gguf_file(const std::string &file_path) { +bool is_gguf_file(const std::string& file_path) { std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { return false; @@ -608,7 +606,7 @@ bool is_gguf_file(const std::string &file_path) { return true; } -bool is_safetensors_file(const std::string &file_path) { +bool is_safetensors_file(const std::string& file_path) { std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { return false; @@ -625,7 +623,7 @@ bool is_safetensors_file(const std::string &file_path) { } uint8_t header_size_buf[ST_HEADER_SIZE_LEN]; - file.read((char *) header_size_buf, ST_HEADER_SIZE_LEN); + file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN); if (!file) { return false; } @@ -650,7 +648,7 @@ bool is_safetensors_file(const std::string &file_path) { return true; } -bool ModelLoader::init_from_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) { if (is_directory(file_path)) { LOG_INFO("load %s using diffusers format", file_path.c_str()); return init_from_diffusers_file(file_path, prefix); @@ -671,14 +669,14 @@ bool ModelLoader::init_from_file(const std::string &file_path, const std::string /*================================================= GGUFModelLoader ==================================================*/ -bool ModelLoader::init_from_gguf_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::string& prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - gguf_context *ctx_gguf_ = NULL; - ggml_context *ctx_meta_ = NULL; - ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); + gguf_context* ctx_gguf_ = NULL; + ggml_context* ctx_meta_ = NULL; + ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); if (!ctx_gguf_) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; @@ -686,12 +684,12 @@ bool ModelLoader::init_from_gguf_file(const std::string &file_path, const std::s int n_tensors = gguf_get_n_tensors(ctx_gguf_); - size_t total_size = 0; + size_t total_size = 0; size_t data_offset = gguf_get_data_offset(ctx_gguf_); for (int i = 0; i < n_tensors; i++) { - std::string name = gguf_get_tensor_name(ctx_gguf_, i); - struct ggml_tensor *dummy = ggml_get_tensor(ctx_meta_, name.c_str()); - size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); + std::string name = gguf_get_tensor_name(ctx_gguf_, i); + struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str()); + size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); // LOG_DEBUG("%s", name.c_str()); @@ -710,7 +708,7 @@ bool ModelLoader::init_from_gguf_file(const std::string &file_path, const std::s /*================================================= SafeTensorsModelLoader ==================================================*/ -ggml_type str_to_ggml_type(const std::string &dtype) { +ggml_type str_to_ggml_type(const std::string& dtype) { ggml_type ttype = GGML_TYPE_COUNT; if (dtype == "F16") { ttype = GGML_TYPE_F16; @@ -723,7 +721,7 @@ ggml_type str_to_ggml_type(const std::string &dtype) { } // https://huggingface.co/docs/safetensors/index -bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const std::string& 
prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; @@ -745,7 +743,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const } uint8_t header_size_buf[ST_HEADER_SIZE_LEN]; - file.read((char *) header_size_buf, ST_HEADER_SIZE_LEN); + file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN); if (!file) { LOG_ERROR("read safetensors header size failed: '%s'", file_path.c_str()); return false; @@ -769,8 +767,8 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const nlohmann::json header_ = nlohmann::json::parse(header_buf.data()); - for (auto &item: header_.items()) { - std::string name = item.key(); + for (auto& item : header_.items()) { + std::string name = item.key(); nlohmann::json tensor_info = item.value(); // LOG_DEBUG("%s %s\n", name.c_str(), tensor_info.dump().c_str()); @@ -782,11 +780,11 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const continue; } - std::string dtype = tensor_info["dtype"]; + std::string dtype = tensor_info["dtype"]; nlohmann::json shape = tensor_info["shape"]; size_t begin = tensor_info["data_offsets"][0].get(); - size_t end = tensor_info["data_offsets"][1].get(); + size_t end = tensor_info["data_offsets"][1].get(); ggml_type type = str_to_ggml_type(dtype); if (type == GGML_TYPE_COUNT) { @@ -799,7 +797,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const return false; } - int n_dims = (int) shape.size(); + int n_dims = (int)shape.size(); int64_t ne[4] = {1, 1, 1, 1}; for (int i = 0; i < n_dims; i++) { ne[i] = shape[i].get(); @@ -827,9 +825,9 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const /*================================================= DiffusersModelLoader ==================================================*/ -bool ModelLoader::init_from_diffusers_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const std::string& prefix) { std::string unet_path = path_join(file_path, "unet/diffusion_pytorch_model.safetensors"); - std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); + std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); std::string clip_path = path_join(file_path, "text_encoder/model.safetensors"); if (!init_from_safetensors_file(unet_path, "unet.")) { @@ -944,7 +942,7 @@ struct PickleTensorReader { CHECK_SIZE, READ_DIMENS }; - ReadPhase phase = READ_NAME; + ReadPhase phase = READ_NAME; size_t entry_size = 0; int32_t nelements = 0; @@ -957,14 +955,14 @@ struct PickleTensorReader { if (phase == CHECK_SIZE) { if (entry_size == value * ggml_type_size(tensor_storage.type)) { nelements = value; - phase = READ_DIMENS; + phase = READ_DIMENS; return true; } else { phase = READ_NAME; } } else if (phase == READ_DIMENS) { if (tensor_storage.n_dims + 1 > 4) { // too many dimens - phase = READ_NAME; + phase = READ_NAME; tensor_storage.n_dims = 0; } if (nelements % value == 0) { @@ -975,23 +973,23 @@ struct PickleTensorReader { return false; } - void read_global(const std::string &str) { + void read_global(const std::string& str) { if (str == "FloatStorage") { if (read_global_type) { - global_type = GGML_TYPE_F32; + global_type = GGML_TYPE_F32; read_global_type = false; } tensor_storage.type = GGML_TYPE_F32; } else if (str == "HalfStorage") { if (read_global_type) { - global_type = 
GGML_TYPE_F16; + global_type = GGML_TYPE_F16; read_global_type = false; } tensor_storage.type = GGML_TYPE_F16; } } - void read_string(const std::string &str, struct zip_t *zip, std::string dir) { + void read_string(const std::string& str, struct zip_t* zip, std::string dir) { if (str == "storage") { read_global_type = true; } else if (str != "state_dict") { @@ -1004,8 +1002,8 @@ struct PickleTensorReader { { std::string name = zip_entry_name(zip); if (name == entry_name) { - tensor_storage.index_in_zip = (int) i; - entry_size = zip_entry_size(zip); + tensor_storage.index_in_zip = (int)i; + entry_size = zip_entry_size(zip); zip_entry_close(zip); break; } @@ -1017,7 +1015,7 @@ struct PickleTensorReader { } if (!read_global_type && phase == READ_NAME) { tensor_storage.name = str; - phase = READ_DATA; + phase = READ_DATA; tensor_storage.type = global_type; } } @@ -1027,7 +1025,7 @@ struct PickleTensorReader { ggml_type PickleTensorReader::global_type = GGML_TYPE_F32; // all pickle_tensors data type bool PickleTensorReader::read_global_type = false; -int find_char(uint8_t *buffer, int len, char c) { +int find_char(uint8_t* buffer, int len, char c) { for (int pos = 0; pos < len; pos++) { if (buffer[pos] == c) { return pos; @@ -1038,13 +1036,13 @@ int find_char(uint8_t *buffer, int len, char c) { #define MAX_STRING_BUFFER 512 -bool ModelLoader::parse_data_pkl(uint8_t *buffer, +bool ModelLoader::parse_data_pkl(uint8_t* buffer, size_t buffer_size, - zip_t *zip, + zip_t* zip, std::string dir, size_t file_index, - const std::string &prefix) { - uint8_t *buffer_end = buffer + buffer_size; + const std::string& prefix) { + uint8_t* buffer_end = buffer + buffer_size; if (buffer[0] == 0x80) { // proto if (buffer[1] != 2) { LOG_ERROR("Unsupported protocol\n"); @@ -1088,8 +1086,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, buffer++; } buffer++; - } - break; + } break; case 'M': // BININT2 = b'M' # push 2-byte unsigned int { uint16_t value = read_short(buffer); @@ -1097,8 +1094,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, buffer++; } buffer += 2; - } - break; + } break; case 'J': // BININT = b'J' # push four-byte signed int { const int32_t value = read_int(buffer); @@ -1106,8 +1102,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, buffer++; // skip tuple after read num_elements } buffer += 4; - } - break; + } break; case 'X': // BINUNICODE = b'X' # " " " ; counted UTF-8 string argument { const int32_t len = read_int(buffer); @@ -1119,8 +1114,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, memcpy(string_buffer, buffer, len < MAX_STRING_BUFFER ? 
len : (MAX_STRING_BUFFER - 1)); buffer += len; reader.read_string(string_buffer, zip, dir); - } - break; + } break; case 0x8C: // SHORT_BINUNICODE = b'\x8c' # push short string; UTF-8 length < 256 bytes { const int8_t len = *buffer; @@ -1129,8 +1123,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, memcpy(string_buffer, buffer, len); buffer += len; // printf("String: '%s'\n", string_buffer); - } - break; + } break; case 'c': // GLOBAL = b'c' # push self.find_class(modname, name); 2 string args { int len = find_char(buffer, MAX_STRING_BUFFER, '\n'); @@ -1142,15 +1135,14 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, memcpy(string_buffer, buffer, len); buffer += len + 1; reader.read_global(string_buffer); - } - break; + } break; case 0x86: // TUPLE2 = b'\x86' # build 2-tuple from two topmost stack items case 0x85: // TUPLE1 = b'\x85' # build 1-tuple from stack top case 't': // TUPLE = b't' # build tuple from topmost stack items if (reader.phase == PickleTensorReader::READ_DIMENS) { reader.tensor_storage.reverse_ne(); reader.tensor_storage.file_index = file_index; - reader.tensor_storage.name = prefix + reader.tensor_storage.name; + reader.tensor_storage.name = prefix + reader.tensor_storage.name; tensor_storages.push_back(reader.tensor_storage); // LOG_DEBUG("%s", reader.tensor_storage.name.c_str()); // reset @@ -1168,31 +1160,31 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, return true; } -bool ModelLoader::init_from_ckpt_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::string& prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - struct zip_t *zip = zip_open(file_path.c_str(), 0, 'r'); + struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; } - int n = (int) zip_entries_total(zip); + int n = (int)zip_entries_total(zip); for (int i = 0; i < n; ++i) { zip_entry_openbyindex(zip, i); { std::string name = zip_entry_name(zip); - size_t pos = name.find("data.pkl"); + size_t pos = name.find("data.pkl"); if (pos != std::string::npos) { std::string dir = name.substr(0, pos); - void *pkl_data = NULL; + void* pkl_data = NULL; size_t pkl_size; zip_entry_read(zip, &pkl_data, &pkl_size); // LOG_DEBUG("%lld", pkl_size); - parse_data_pkl((uint8_t *) pkl_data, pkl_size, zip, dir, file_index, prefix); + parse_data_pkl((uint8_t*)pkl_data, pkl_size, zip, dir, file_index, prefix); free(pkl_data); } @@ -1206,7 +1198,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string &file_path, const std::s SDVersion ModelLoader::get_sd_version() { // return VERSION_1_x; TensorStorage token_embedding_weight; - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) { return VERSION_XL; } @@ -1232,7 +1224,7 @@ SDVersion ModelLoader::get_sd_version() { } ggml_type ModelLoader::get_sd_wtype() { - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } @@ -1246,16 +1238,16 @@ ggml_type ModelLoader::get_sd_wtype() { } std::string ModelLoader::load_merges() { - std::string merges_utf8_str(reinterpret_cast(merges_utf8_c_str), sizeof(merges_utf8_c_str)); + std::string merges_utf8_str(reinterpret_cast(merges_utf8_c_str), 
sizeof(merges_utf8_c_str)); return merges_utf8_str; } -void remove_duplicates(std::vector &vec) { +void remove_duplicates(std::vector& vec) { std::unordered_map name_to_index_map; for (size_t i = 0; i < vec.size(); ++i) { - const std::string ¤t_name = vec[i].name; - auto it = name_to_index_map.find(current_name); + const std::string& current_name = vec[i].name; + auto it = name_to_index_map.find(current_name); if (it != name_to_index_map.end()) { vec[it->second] = vec[i]; @@ -1269,7 +1261,7 @@ void remove_duplicates(std::vector &vec) { bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) { std::vector processed_tensor_storages; - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { // LOG_DEBUG("%s", name.c_str()); if (is_unused_tensor(tensor_storage.name)) { @@ -1291,7 +1283,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } bool is_zip = false; - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { if (tensor_storage.file_index != file_index) { continue; } @@ -1301,7 +1293,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } } - struct zip_t *zip = NULL; + struct zip_t* zip = NULL; if (is_zip) { zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { @@ -1313,16 +1305,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend std::vector read_buffer; std::vector convert_buffer; - auto read_data = [&](const TensorStorage &tensor_storage, char *buf, size_t n) { + auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) { if (zip != NULL) { zip_entry_openbyindex(zip, tensor_storage.index_in_zip); size_t entry_size = zip_entry_size(zip); if (entry_size != n) { read_buffer.resize(entry_size); - zip_entry_noallocread(zip, (void *) read_buffer.data(), entry_size); - memcpy((void *) buf, (void *) (read_buffer.data() + tensor_storage.offset), n); + zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); + memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); } else { - zip_entry_noallocread(zip, (void *) buf, n); + zip_entry_noallocread(zip, (void*)buf, n); } zip_entry_close(zip); } else { @@ -1336,13 +1328,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend return true; }; - for (auto &tensor_storage: processed_tensor_storages) { + for (auto& tensor_storage : processed_tensor_storages) { if (tensor_storage.file_index != file_index) { continue; } // LOG_DEBUG("%s", tensor_storage.name.c_str()); - ggml_tensor *dst_tensor = NULL; + ggml_tensor* dst_tensor = NULL; success = on_new_tensor_cb(tensor_storage, &dst_tensor); if (!success) { @@ -1360,37 +1352,37 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend #ifdef SD_USE_METAL || ggml_backend_is_metal(backend) #endif - ) { + ) { // for the CPU and Metal backend, we can copy directly into the tensor if (tensor_storage.type == dst_tensor->type) { GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); - read_data(tensor_storage, (char *) dst_tensor->data, nbytes_to_read); + read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t *) dst_tensor->data, (float *) dst_tensor->data, + bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); } } else { 
read_buffer.resize(tensor_storage.nbytes()); - read_data(tensor_storage, (char *) read_buffer.data(), nbytes_to_read); + read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t *) read_buffer.data(), (float *) read_buffer.data(), + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); } - convert_tensor((void *) read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int) tensor_storage.nelements()); + convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, + dst_tensor->type, (int)tensor_storage.nelements()); } } else { read_buffer.resize(tensor_storage.nbytes()); - read_data(tensor_storage, (char *) read_buffer.data(), nbytes_to_read); + read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t *) read_buffer.data(), (float *) read_buffer.data(), + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); } @@ -1400,9 +1392,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } else { // convert first, then copy to device memory convert_buffer.resize(ggml_nbytes(dst_tensor)); - convert_tensor((void *) read_buffer.data(), tensor_storage.type, - (void *) convert_buffer.data(), dst_tensor->type, - (int) tensor_storage.nelements()); + convert_tensor((void*)read_buffer.data(), tensor_storage.type, + (void*)convert_buffer.data(), dst_tensor->type, + (int)tensor_storage.nelements()); ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); } } @@ -1419,16 +1411,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend return success; } -bool ModelLoader::load_tensors(std::map &tensors, +bool ModelLoader::load_tensors(std::map& tensors, ggml_backend_t backend, std::set ignore_tensors, bool standalone) { std::set tensor_names_in_file; - auto on_new_tensor_cb = [&](const TensorStorage &tensor_storage, ggml_tensor **dst_tensor) -> bool { - const std::string &name = tensor_storage.name; + auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { + const std::string& name = tensor_storage.name; tensor_names_in_file.insert(name); - struct ggml_tensor *real; + struct ggml_tensor* real; if (tensors.find(name) != tensors.end()) { real = tensors[name]; } else { @@ -1443,17 +1435,17 @@ bool ModelLoader::load_tensors(std::map &tens } if ( - real->ne[0] != tensor_storage.ne[0] || - real->ne[1] != tensor_storage.ne[1] || - real->ne[2] != tensor_storage.ne[2] || - real->ne[3] != tensor_storage.ne[3]) { + real->ne[0] != tensor_storage.ne[0] || + real->ne[1] != tensor_storage.ne[1] || + real->ne[2] != tensor_storage.ne[2] || + real->ne[3] != tensor_storage.ne[3]) { LOG_ERROR( - "tensor '%s' has wrong shape in model file: " - "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", - name.c_str(), - (int) tensor_storage.ne[0], (int) tensor_storage.ne[1], (int) tensor_storage.ne[2], - (int) tensor_storage.ne[3], - (int) real->ne[0], (int) real->ne[1], (int) real->ne[2], (int) real->ne[3]); + "tensor '%s' has wrong shape in model file: " + "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", + name.c_str(), + (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], + (int)tensor_storage.ne[3], + (int)real->ne[0], (int)real->ne[1], (int)real->ne[2], 
(int)real->ne[3]); return false; } @@ -1470,7 +1462,7 @@ bool ModelLoader::load_tensors(std::map &tens bool some_tensor_not_init = false; - for (auto pair: tensors) { + for (auto pair : tensors) { if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) { continue; } @@ -1494,18 +1486,18 @@ bool ModelLoader::load_tensors(std::map &tens return true; } -bool ModelLoader::save_to_gguf_file(const std::string &file_path, ggml_type type) { - auto backend = ggml_backend_cpu_init(); +bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) { + auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); mem_size += cal_mem_size(backend, type); LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f); - ggml_context *ggml_ctx = ggml_init({mem_size, NULL, false}); + ggml_context* ggml_ctx = ggml_init({mem_size, NULL, false}); - gguf_context *gguf_ctx = gguf_init_empty(); + gguf_context* gguf_ctx = gguf_init_empty(); - auto on_new_tensor_cb = [&](const TensorStorage &tensor_storage, ggml_tensor **dst_tensor) -> bool { - const std::string &name = tensor_storage.name; + auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { + const std::string& name = tensor_storage.name; ggml_type tensor_type = tensor_storage.type; if (type != GGML_TYPE_COUNT) { @@ -1516,7 +1508,7 @@ bool ModelLoader::save_to_gguf_file(const std::string &file_path, ggml_type type } } - ggml_tensor *tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); + ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); if (tensor == NULL) { LOG_ERROR("ggml_new_tensor failed"); return false; @@ -1555,14 +1547,14 @@ int64_t ModelLoader::cal_mem_size(ggml_backend_t backend, ggml_type type) { } int64_t mem_size = 0; std::vector processed_tensor_storages; - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } preprocess_tensor(tensor_storage, processed_tensor_storages); } - for (auto &tensor_storage: processed_tensor_storages) { + for (auto& tensor_storage : processed_tensor_storages) { ggml_type tensor_type = tensor_storage.type; if (type != GGML_TYPE_COUNT) { if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) { @@ -1578,7 +1570,7 @@ int64_t ModelLoader::cal_mem_size(ggml_backend_t backend, ggml_type type) { return mem_size; } -bool convert(const char *input_path, const char *vae_path, const char *output_path, sd_type_t output_type) { +bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -1592,6 +1584,6 @@ bool convert(const char *input_path, const char *vae_path, const char *output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type) output_type); + bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); return success; } \ No newline at end of file diff --git a/model.h b/model.h index b0d61547..86f7649c 100644 --- a/model.h +++ b/model.h @@ -120,7 +120,8 @@ class ModelLoader { bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend); bool load_tensors(std::map& tensors, ggml_backend_t backend, - std::set ignore_tensors 
= {}, bool standalone=true); + std::set ignore_tensors = {}, + bool standalone = true); bool save_to_gguf_file(const std::string& file_path, ggml_type type); int64_t cal_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 3954e326..28f5d8c8 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -14,60 +14,59 @@ #include "unet.hpp" #include "vae.hpp" -const char *model_version_to_str[] = { - "1.x", - "2.x", - "XL", +const char* model_version_to_str[] = { + "1.x", + "2.x", + "XL", }; -const char *sampling_methods_str[] = { - "Euler A", - "Euler", - "Heun", - "DPM2", - "DPM++ (2s)", - "DPM++ (2M)", - "modified DPM++ (2M)", - "LCM", +const char* sampling_methods_str[] = { + "Euler A", + "Euler", + "Heun", + "DPM2", + "DPM++ (2s)", + "DPM++ (2M)", + "modified DPM++ (2M)", + "LCM", }; /*================================================== Helper Functions ================================================*/ -void calculate_alphas_cumprod(float *alphas_cumprod, +void calculate_alphas_cumprod(float* alphas_cumprod, float linear_start = 0.00085f, - float linear_end = 0.0120, - int timesteps = TIMESTEPS) { + float linear_end = 0.0120, + int timesteps = TIMESTEPS) { float ls_sqrt = sqrtf(linear_start); float le_sqrt = sqrtf(linear_end); - float amount = le_sqrt - ls_sqrt; + float amount = le_sqrt - ls_sqrt; float product = 1.0f; for (int i = 0; i < timesteps; i++) { - float beta = ls_sqrt + amount * ((float) i / (timesteps - 1)); + float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); product *= 1.0f - powf(beta, 2.0f); alphas_cumprod[i] = product; } } - /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { public: SDVersion version; - bool vae_decode_only = false; + bool vae_decode_only = false; bool free_params_immediately = false; std::shared_ptr rng = std::make_shared(); - int n_threads = -1; - float scale_factor = 0.18215f; + int n_threads = -1; + float scale_factor = 0.18215f; FrozenCLIPEmbedderWithCustomWords cond_stage_model; UNetModel diffusion_model; AutoEncoderKL first_stage_model; bool use_tiny_autoencoder = false; - bool vae_tiling = false; + bool vae_tiling = false; - std::map tensors; + std::map tensors; std::string lora_model_dir; // lora_name => multiplier @@ -75,11 +74,11 @@ class StableDiffusionGGML { std::map loras; std::shared_ptr denoiser = std::make_shared(); - schedule_t schedule = DEFAULT; + schedule_t schedule = DEFAULT; - ggml_backend_t backend = NULL; // general backend - ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type - ggml_type wtype = GGML_TYPE_COUNT; // options weight type + ggml_backend_t backend = NULL; // general backend + ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type + ggml_type wtype = GGML_TYPE_COUNT; // options weight type TinyAutoEncoder tae_first_stage; std::string taesd_path; @@ -97,15 +96,15 @@ class StableDiffusionGGML { ggml_type wtype, schedule_t schedule, bool init_backend_immediately = true) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - free_params_immediately(free_params_immediately), - lora_model_dir(lora_model_dir), - vae_tiling(vae_tiling), - wtype(wtype), - schedule(schedule) { + : n_threads(n_threads), + vae_decode_only(vae_decode_only), + free_params_immediately(free_params_immediately), + lora_model_dir(lora_model_dir), + vae_tiling(vae_tiling), + wtype(wtype), + 
schedule(schedule) {
        first_stage_model.decode_only = vae_decode_only;
-        tae_first_stage.decode_only = vae_decode_only;
+        tae_first_stage.decode_only = vae_decode_only;
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
         }
@@ -151,24 +150,23 @@ class StableDiffusionGGML {
                     rng_type_t rng_type,
                     bool vae_tiling,
                     sd_type_t wtype,
-                     schedule_t schedule
-    ) {
-        this->n_threads = n_threads;
-        this->vae_decode_only = vae_decode_only;
+                     schedule_t schedule) {
+        this->n_threads               = n_threads;
+        this->vae_decode_only         = vae_decode_only;
         this->free_params_immediately = free_params_immediately;
-        this->lora_model_dir = lora_model_dir;
+        this->lora_model_dir          = lora_model_dir;
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
         }
         this->vae_tiling = vae_tiling;
-        this->wtype = (ggml_type) wtype;
-        this->schedule = schedule;
+        this->wtype      = (ggml_type)wtype;
+        this->schedule   = schedule;
         apply_schedule();
     }

-    bool load_clip_from_file(const std::string &model_path, bool standalone = true, const std::string &prefix = "te.") {
+    bool load_clip_from_file(const std::string& model_path, bool standalone = true, const std::string& prefix = "te.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -234,11 +232,11 @@
         }

         struct ggml_init_params params;
-        params.mem_size = static_cast<size_t>(3 * 1024) * 1024;  // 10M
+        params.mem_size   = static_cast<size_t>(3 * 1024) * 1024;  // 3 MB
         params.mem_buffer = NULL;
-        params.no_alloc = false;
+        params.no_alloc   = false;
         // LOG_DEBUG("mem_size %u ", params.mem_size);
-        struct ggml_context *ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
             return false;
@@ -248,10 +246,10 @@
         LOG_DEBUG("loading clip weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor*> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        for (auto &pair: tensors) {
+        for (auto& pair : tensors) {
             tensors_need_to_load.insert(pair);
         }
@@ -275,9 +273,9 @@
         }
     }

-    bool load_unet_from_file(const std::string &model_path,
-                             bool standalone = true,
-                             const std::string &prefix = "unet.") {
+    bool load_unet_from_file(const std::string& model_path,
+                             bool standalone = true,
+                             const std::string& prefix = "unet.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -310,11 +308,11 @@
         }

         struct ggml_init_params params;
-        params.mem_size = static_cast<size_t>(3 * 1024) * 1024;  // 10M
+        params.mem_size   = static_cast<size_t>(3 * 1024) * 1024;  // 3 MB
         params.mem_buffer = NULL;
-        params.no_alloc = false;
+        params.no_alloc   = false;

-        struct ggml_context *ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
             return false;
@@ -325,13 +323,13 @@
         LOG_DEBUG("loading weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor*> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        ggml_tensor *alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
-
calculate_alphas_cumprod((float *) alphas_cumprod_tensor->data); + ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); + calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); tensors_need_to_load["alphas_cumprod"] = alphas_cumprod_tensor; - for (auto &pair: tensors) { - const std::string &name = pair.first; + for (auto& pair : tensors) { + const std::string& name = pair.first; if (starts_with(name, "cond_stage_model.") || starts_with(name, "first_stage_model.")) { ignore_tensors.insert(name); continue; @@ -367,16 +365,15 @@ class StableDiffusionGGML { return true; } - void free_unet_params() { if (diffusion_model.params_buffer_size > 0) { diffusion_model.free_params_buffer(); } } - bool load_vae_from_file(const std::string &model_path, - bool standalone = true, - const std::string &prefix = "vae.") { + bool load_vae_from_file(const std::string& model_path, + bool standalone = true, + const std::string& prefix = "vae.") { if (backend == NULL) { LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); return false; @@ -413,11 +410,11 @@ class StableDiffusionGGML { } struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_size = static_cast(10 * 1024) * 1024; // 10M params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check if (!ctx) { LOG_ERROR("ggml_init() failed"); return false; @@ -427,10 +424,10 @@ class StableDiffusionGGML { LOG_DEBUG("loading weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set ignore_tensors; - for (auto &pair: tensors) { - const std::string &name = pair.first; + for (auto& pair : tensors) { + const std::string& name = pair.first; // TODO: make it can reload in compute time. so we can set vae_decode_only dynamic. 
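// The filter just below is what fixes vae_decode_only at load time: when it is
// set, "first_stage_model.encoder.*" and "first_stage_model.quant*" weights are
// skipped and never materialized, so switching the same context back to encoding
// later means reloading the VAE weights (hence the TODO above).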
if (vae_decode_only && (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) { @@ -459,8 +456,8 @@ class StableDiffusionGGML { } } - //load the all model from one file - bool load_diffusions_from_file(const std::string &model_path) { + // load the all model from one file + bool load_diffusions_from_file(const std::string& model_path) { LOG_INFO("loading model from '%s'", model_path.c_str()); if (!load_clip_from_file(model_path, false, "")) { free_clip_params(); @@ -494,7 +491,7 @@ class StableDiffusionGGML { LOG_INFO("free vae params"); } - bool load_taesd_from_file(const std::string &taesd_path) { + bool load_taesd_from_file(const std::string& taesd_path) { if (first_stage_model.params_buffer_size > 0) { free_vae_params(); } @@ -511,34 +508,34 @@ class StableDiffusionGGML { } } - bool is_using_v_parameterization_for_sd2(ggml_context *work_ctx) { - struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); + bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) { + struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); ggml_set_f32(x_t, 0.5); - struct ggml_tensor *c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); + struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); ggml_set_f32(c, 0.5); - struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, - 1); // [N, ] - struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, - diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + 1); // [N, ] + struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] int64_t t0 = ggml_time_ms(); ggml_set_f32(timesteps, 999); set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels); - struct ggml_tensor *out = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); diffusion_model.alloc_compute_buffer(x_t, c, t_emb); diffusion_model.compute(out, n_threads, x_t, NULL, c, t_emb); diffusion_model.free_compute_buffer(); double result = 0.f; { - float *vec_x = (float *) x_t->data; - float *vec_out = (float *) out->data; + float* vec_x = (float*)x_t->data; + float* vec_out = (float*)out->data; int64_t n = ggml_nelements(out); for (int i = 0; i < n; i++) { - result += ((double) vec_out[i] - (double) vec_x[i]); + result += ((double)vec_out[i] - (double)vec_x[i]); } result /= n; } @@ -571,15 +568,15 @@ class StableDiffusionGGML { for (int i = 0; i < TIMESTEPS; i++) { denoiser->schedule->alphas_cumprod[i] = alphas_cumprod_tensor[i]; - denoiser->schedule->sigmas[i] = std::sqrt( - (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); + denoiser->schedule->sigmas[i] = std::sqrt( + (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); } } - void apply_lora(const std::string &lora_name, float multiplier) { - int64_t t0 = ggml_time_ms(); - std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); + void apply_lora(const std::string& lora_name, float multiplier) { + int64_t t0 = ggml_time_ms(); + std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); std::string 
file_path; if (file_exists(st_file_path)) { @@ -609,14 +606,14 @@ class StableDiffusionGGML { (t1 - t0) * 1.0f / 1000); } - void apply_loras(const std::unordered_map &lora_state) { + void apply_loras(const std::unordered_map& lora_state) { if (lora_state.size() > 0 && model_data_type != GGML_TYPE_F16 && model_data_type != GGML_TYPE_F32) { LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); } std::unordered_map lora_state_diff; - for (auto &kv: lora_state) { - const std::string &lora_name = kv.first; - float multiplier = kv.second; + for (auto& kv : lora_state) { + const std::string& lora_name = kv.first; + float multiplier = kv.second; if (curr_lora_state.find(lora_name) != curr_lora_state.end()) { float curr_multiplier = curr_lora_state[lora_name]; @@ -629,35 +626,35 @@ class StableDiffusionGGML { } } - for (auto &kv: lora_state_diff) { + for (auto& kv : lora_state_diff) { apply_lora(kv.first, kv.second); } curr_lora_state = lora_state; } - std::pair get_learned_condition(ggml_context *work_ctx, - const std::string &text, - int clip_skip, - int width, - int height, - bool force_zero_embeddings = false) { + std::pair get_learned_condition(ggml_context* work_ctx, + const std::string& text, + int clip_skip, + int width, + int height, + bool force_zero_embeddings = false) { cond_stage_model.set_clip_skip(clip_skip); - auto tokens_and_weights = cond_stage_model.tokenize(text, true); - std::vector &tokens = tokens_and_weights.first; - std::vector &weights = tokens_and_weights.second; - int64_t t0 = ggml_time_ms(); - struct ggml_tensor *pooled = NULL; - size_t total_hidden_size = cond_stage_model.text_model.hidden_size; + auto tokens_and_weights = cond_stage_model.tokenize(text, true); + std::vector& tokens = tokens_and_weights.first; + std::vector& weights = tokens_and_weights.second; + int64_t t0 = ggml_time_ms(); + struct ggml_tensor* pooled = NULL; + size_t total_hidden_size = cond_stage_model.text_model.hidden_size; if (version == VERSION_XL) { total_hidden_size += cond_stage_model.text_model2.hidden_size; pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, cond_stage_model.text_model2.projection_dim); } - struct ggml_tensor *hidden_states = ggml_new_tensor_2d(work_ctx, + struct ggml_tensor* hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, total_hidden_size, cond_stage_model.text_model.max_position_embeddings); // [N, n_token, hidden_size] - cond_stage_model.alloc_compute_buffer(work_ctx, (int) tokens.size()); + cond_stage_model.alloc_compute_buffer(work_ctx, (int)tokens.size()); cond_stage_model.compute(n_threads, tokens, hidden_states, pooled); cond_stage_model.free_compute_buffer(); // if (pooled != NULL) { @@ -667,7 +664,7 @@ class StableDiffusionGGML { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - ggml_tensor *result = ggml_dup_tensor(work_ctx, hidden_states); + ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states); { float original_mean = ggml_tensor_mean(hidden_states); for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) { @@ -683,34 +680,34 @@ class StableDiffusionGGML { ggml_tensor_scale(result, (original_mean / new_mean)); } if (force_zero_embeddings) { - float *vec = (float *) result->data; + float* vec = (float*)result->data; for (int i = 0; i < ggml_nelements(result); i++) { vec[i] = 0; } } - ggml_tensor *vec = NULL; + ggml_tensor* vec = NULL; if (version == VERSION_XL) { int out_dim = 256; - vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 
diffusion_model.adm_in_channels); + vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); // [0:1280] size_t offset = 0; memcpy(vec->data, pooled->data, ggml_nbytes(pooled)); offset += ggml_nbytes(pooled); - struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); + struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); // original_size_as_tuple - float orig_width = (float) width; - float orig_height = (float) height; + float orig_width = (float)width; + float orig_height = (float)height; ggml_tensor_set_f32(timesteps, orig_height, 0); ggml_tensor_set_f32(timesteps, orig_width, 1); - ggml_tensor *embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, + ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); offset += ggml_nbytes(embed_view); set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // crop_coords_top_left - float crop_coord_top = 0.f; + float crop_coord_top = 0.f; float crop_coord_left = 0.f; ggml_tensor_set_f32(timesteps, crop_coord_top, 0); ggml_tensor_set_f32(timesteps, crop_coord_left, 1); @@ -719,8 +716,8 @@ class StableDiffusionGGML { set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // target_size_as_tuple - float target_width = (float) width; - float target_height = (float) height; + float target_width = (float)width; + float target_height = (float)height; ggml_tensor_set_f32(timesteps, target_height, 0); ggml_tensor_set_f32(timesteps, target_width, 1); embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); @@ -733,27 +730,27 @@ class StableDiffusionGGML { return {result, vec}; } - ggml_tensor *sample(ggml_context *work_ctx, - ggml_tensor *x_t, - ggml_tensor *noise, - ggml_tensor *c, - ggml_tensor *c_vector, - ggml_tensor *uc, - ggml_tensor *uc_vector, + ggml_tensor* sample(ggml_context* work_ctx, + ggml_tensor* x_t, + ggml_tensor* noise, + ggml_tensor* c, + ggml_tensor* c_vector, + ggml_tensor* uc, + ggml_tensor* uc_vector, float cfg_scale, sample_method_t method, - const std::vector &sigmas) { + const std::vector& sigmas) { size_t steps = sigmas.size() - 1; // x_t = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(x_t); - struct ggml_tensor *x = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor* x = ggml_dup_tensor(work_ctx, x_t); copy_ggml_tensor(x, x_t); - struct ggml_tensor *noised_input = ggml_dup_tensor(work_ctx, x_t); - struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, - 1); // [N, ] - struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, - diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + 1); // [N, ] + struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] diffusion_model.alloc_compute_buffer(noised_input, c, t_emb, c_vector); bool has_unconditioned = cfg_scale != 1.0 && uc != NULL; @@ -768,31 +765,31 @@ class StableDiffusionGGML { } // denoise wrapper - struct ggml_tensor *out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *out_uncond = NULL; + struct ggml_tensor* 
out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* out_uncond = NULL; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); } - struct ggml_tensor *denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - auto denoise = [&](ggml_tensor *input, float sigma, int step) { + auto denoise = [&](ggml_tensor* input, float sigma, int step) { if (step == 1) { - pretty_progress(0, (int) steps, 0); + pretty_progress(0, (int)steps, 0); } int64_t t0 = ggml_time_us(); - float c_skip = 1.0f; - float c_out = 1.0f; - float c_in = 1.0f; + float c_skip = 1.0f; + float c_out = 1.0f; + float c_in = 1.0f; std::vector scaling = denoiser->get_scalings(sigma); if (scaling.size() == 3) { // CompVisVDenoiser c_skip = scaling[0]; - c_out = scaling[1]; - c_in = scaling[2]; + c_out = scaling[1]; + c_in = scaling[2]; } else { // CompVisDenoiser c_out = scaling[0]; - c_in = scaling[1]; + c_in = scaling[1]; } float t = denoiser->schedule->sigma_to_t(sigma); @@ -806,16 +803,16 @@ class StableDiffusionGGML { // cond diffusion_model.compute(out_cond, n_threads, noised_input, NULL, c, t_emb, c_vector); - float *negative_data = NULL; + float* negative_data = NULL; if (has_unconditioned) { // uncond diffusion_model.compute(out_uncond, n_threads, noised_input, NULL, uc, t_emb, uc_vector); - negative_data = (float *) out_uncond->data; + negative_data = (float*)out_uncond->data; } - float *vec_denoised = (float *) denoised->data; - float *vec_input = (float *) input->data; - float *positive_data = (float *) out_cond->data; - int ne_elements = (int) ggml_nelements(denoised); + float* vec_denoised = (float*)denoised->data; + float* vec_input = (float*)input->data; + float* positive_data = (float*)out_cond->data; + int ne_elements = (int)ggml_nelements(denoised); for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -828,7 +825,7 @@ class StableDiffusionGGML { } int64_t t1 = ggml_time_us(); if (step > 0) { - pretty_progress(step, (int) steps, (t1 - t0) / 1000000.f); + pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } }; @@ -836,8 +833,8 @@ class StableDiffusionGGML { // sample_euler_ancestral switch (method) { case EULER_A: { - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -847,9 +844,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int i = 0; i < ggml_nelements(d); i++) { vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; @@ -857,18 +854,18 @@ class StableDiffusionGGML { } // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * - (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / - (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * 
sigma_up); // Euler method float dt = sigma_down - sigmas[i]; // x = x + d * dt { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_d[i] * dt; @@ -880,8 +877,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float *vec_x = (float *) x->data; - float *vec_noise = (float *) noise->data; + float* vec_x = (float*)x->data; + float* vec_noise = (float*)noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -889,11 +886,10 @@ class StableDiffusionGGML { } } } - } - break; + } break; case EULER: // Implemented without any sigma churn { - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -903,9 +899,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma; @@ -915,19 +911,18 @@ class StableDiffusionGGML { float dt = sigmas[i + 1] - sigma; // x = x + d * dt { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } } - } - break; + } break; case HEUN: { - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -935,9 +930,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -948,25 +943,25 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } else { // Heun step - float *vec_d = (float *) d->data; - float *vec_d2 = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_x2 = (float *) x2->data; + float* vec_d = (float*)d->data; + float* vec_d2 = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_x2 = (float*)x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt; } denoise(x2, sigmas[i + 1], i + 1); - float *vec_denoised = (float *) denoised->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; vec_d[j] = (vec_d[j] + d2) / 2; @@ -974,11 +969,10 @@ class StableDiffusionGGML { } } } 
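// Heun recap for the branch above: one textbook predictor-corrector step on
// dx/dsigma, where D is the denoiser:
//   d  = (x - D(x; s_i)) / s_i                  Euler slope at s_i
//   x2 = x + d * (s_{i+1} - s_i)                predictor
//   d2 = (x2 - D(x2; s_{i+1})) / s_{i+1}        slope at the predicted point
//   x += ((d + d2) / 2) * (s_{i+1} - s_i)       trapezoidal correction
// which is why denoise() runs twice per step unless s_{i+1} == 0, where the
// plain Euler update is taken instead.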
- } - break; + } break; case DPM2: { - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -986,9 +980,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -998,9 +992,9 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float dt = sigmas[i + 1] - sigmas[i]; - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; + float dt = sigmas[i + 1] - sigmas[i]; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; @@ -1008,18 +1002,18 @@ class StableDiffusionGGML { } else { // DPM-Solver-2 float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1]))); - float dt_1 = sigma_mid - sigmas[i]; - float dt_2 = sigmas[i + 1] - sigmas[i]; + float dt_1 = sigma_mid - sigmas[i]; + float dt_2 = sigmas[i + 1] - sigmas[i]; - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_x2 = (float *) x2->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_x2 = (float*)x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt_1; } denoise(x2, sigma_mid, i + 1); - float *vec_denoised = (float *) denoised->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; vec_x[j] = vec_x[j] + d2 * dt_2; @@ -1027,31 +1021,30 @@ class StableDiffusionGGML { } } - } - break; + } break; case DPMPP2S_A: { - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise denoise(x, sigmas[i], i + 1); // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * - (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / - (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - auto sigma_fn = [](float t) -> float { return exp(-t); }; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + auto sigma_fn = [](float t) -> float { return exp(-t); }; if (sigma_down == 0) { // Euler step - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { 
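// Context for the get_ancestral_step values computed above (shared with EULER_A):
// the deterministic part of the step only travels down to sigma_down, and the
// remaining variance to reach s_{i+1} is refilled with fresh noise scaled by
// sigma_up, with
//   sigma_up   = min(s_{i+1}, sqrt(s_{i+1}^2 * (s_i^2 - s_{i+1}^2) / s_i^2))
//   sigma_down = sqrt(s_{i+1}^2 - sigma_up^2)
// so the marginal noise level at s_{i+1} is preserved while adding stochasticity.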
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -1067,15 +1060,15 @@ class StableDiffusionGGML { } } else { // DPM-Solver++(2S) - float t = t_fn(sigmas[i]); + float t = t_fn(sigmas[i]); float t_next = t_fn(sigma_down); - float h = t_next - t; - float s = t + 0.5f * h; + float h = t_next - t; + float s = t + 0.5f * h; - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_x2 = (float *) x2->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_x2 = (float*)x2->data; + float* vec_denoised = (float*)denoised->data; // First half-step for (int j = 0; j < ggml_nelements(x); j++) { @@ -1094,8 +1087,8 @@ class StableDiffusionGGML { if (sigmas[i + 1] > 0) { ggml_tensor_set_f32_randn(noise, rng); { - float *vec_x = (float *) x->data; - float *vec_noise = (float *) noise->data; + float* vec_x = (float*)x->data; + float* vec_noise = (float*)noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -1103,11 +1096,10 @@ class StableDiffusionGGML { } } } - } - break; + } break; case DPMPP2M: // DPM++ (2M) from Karras et al (2022) { - struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1115,14 +1107,14 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float b = exp(-h) - 1.f; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; - float *vec_old_denoised = (float *) old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float b = exp(-h) - 1.f; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; + float* vec_old_denoised = (float*)old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -1131,10 +1123,10 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float r = h_last / h; + float r = h_last / h; for (int j = 0; j < ggml_nelements(x); j++) { float denoised_d = - (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -1144,11 +1136,10 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } - break; + } break; case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 { - struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1156,13 +1147,13 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; - float *vec_old_denoised = (float *) old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float* vec_x = 
(float*)x->data; + float* vec_denoised = (float*)denoised->data; + float* vec_old_denoised = (float*)old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -1172,14 +1163,14 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float h_min = std::min(h_last, h); - float h_max = std::max(h_last, h); - float r = h_max / h_min; - float h_d = (h_max + h_min) / 2.f; - float b = exp(-h_d) - 1.f; + float h_min = std::min(h_last, h); + float h_max = std::max(h_last, h); + float r = h_max / h_min; + float h_d = (h_max + h_min) / 2.f; + float b = exp(-h_d) - 1.f; for (int j = 0; j < ggml_nelements(x); j++) { float denoised_d = - (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -1189,12 +1180,11 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } - break; + } break; case LCM: // Latent Consistency Models { - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -1204,8 +1194,8 @@ class StableDiffusionGGML { // x = denoised { - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_denoised[j]; } @@ -1216,8 +1206,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float *vec_x = (float *) x->data; - float *vec_noise = (float *) noise->data; + float* vec_x = (float*)x->data; + float* vec_noise = (float*)noise->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j]; @@ -1225,8 +1215,7 @@ class StableDiffusionGGML { } } } - } - break; + } break; default: LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); @@ -1237,28 +1226,28 @@ class StableDiffusionGGML { } // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor *get_first_stage_encoding(ggml_context *work_ctx, ggml_tensor *moments) { + ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor *latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], - moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, latent); + ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], + moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "noise.bin"); { - float mean = 0; + float mean = 0; float logvar = 0; - float value = 0; - float std_ = 0; + float value = 0; + float std_ = 0; for (int i = 0; i < latent->ne[3]; i++) { for (int j = 0; j < latent->ne[2]; j++) { for (int k = 0; k < latent->ne[1]; k++) { for (int l = 0; l < latent->ne[0]; l++) { - mean = ggml_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_tensor_get_f32(moments, l, k, j + 
(int) latent->ne[2], i); + mean = ggml_tensor_get_f32(moments, l, k, j, i); + logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); - value = value * scale_factor; + std_ = std::exp(0.5f * logvar); + value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); + value = value * scale_factor; // printf("%d %d %d %d -> %f\n", i, j, k, l, value); ggml_tensor_set_f32(latent, value, l, k, j, i); } @@ -1269,14 +1258,14 @@ class StableDiffusionGGML { return latent; } - ggml_tensor *compute_first_stage(ggml_context *work_ctx, ggml_tensor *x, bool decode) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - ggml_tensor *result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, + ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + ggml_tensor* result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, decode ? (W * 8) : (W / 8), // width decode ? (H * 8) : (H / 8), // height decode ? 3 : (use_tiny_autoencoder ? 4 : 8)); // channels - int64_t t0 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); if (!use_tiny_autoencoder) { if (decode) { ggml_tensor_scale(x, 1.0f / scale_factor); @@ -1285,7 +1274,7 @@ class StableDiffusionGGML { } if (vae_tiling && decode) { // TODO: support tiling vae encode // split latent in 32x32 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) { + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { if (init) { first_stage_model.alloc_compute_buffer(in, decode); } else { @@ -1304,7 +1293,7 @@ class StableDiffusionGGML { } else { if (vae_tiling && decode) { // TODO: support tiling vae encode // split latent in 64x64 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) { + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { if (init) { tae_first_stage.alloc_compute_buffer(in, decode); } else { @@ -1327,11 +1316,11 @@ class StableDiffusionGGML { return result; } - ggml_tensor *encode_first_stage(ggml_context *work_ctx, ggml_tensor *x) { + ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { return compute_first_stage(work_ctx, x, false); } - ggml_tensor *decode_first_stage(ggml_context *work_ctx, ggml_tensor *x) { + ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { return compute_first_stage(work_ctx, x, true); } }; @@ -1339,19 +1328,19 @@ class StableDiffusionGGML { /*================================================= SD API ==================================================*/ struct sd_ctx_t { - StableDiffusionGGML *sd = NULL; + StableDiffusionGGML* sd = NULL; }; -sd_ctx_t *new_sd_ctx(int n_threads, +sd_ctx_t* new_sd_ctx(int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir_c_str, + const char* lora_model_dir_c_str, enum rng_type_t rng_type, bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, bool init_backend_immediately) { - sd_ctx_t *sd_ctx = (sd_ctx_t *) malloc(sizeof(sd_ctx_t)); + sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; } @@ -1365,12 +1354,11 @@ sd_ctx_t *new_sd_ctx(int n_threads, vae_tiling, static_cast(wtype), s, - init_backend_immediately - ); + init_backend_immediately); return sd_ctx; } -void free_sd_ctx(sd_ctx_t *sd_ctx) { +void free_sd_ctx(sd_ctx_t* 
sd_ctx) { if (sd_ctx->sd != NULL) { delete sd_ctx->sd; sd_ctx->sd = NULL; @@ -1378,7 +1366,7 @@ void free_sd_ctx(sd_ctx_t *sd_ctx) { free(sd_ctx); } -void init_backend(sd_ctx_t *sd_ctx) { +void init_backend(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1386,41 +1374,39 @@ void init_backend(sd_ctx_t *sd_ctx) { sd_ctx->sd->init_backend(); } -void set_options(sd_ctx_t *sd_ctx, +void set_options(sd_ctx_t* sd_ctx, int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir, + const char* lora_model_dir, rng_type_t rng_type, bool vae_tiling, sd_type_t wtype, - schedule_t schedule -) { + schedule_t schedule) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; } sd_ctx->sd->set_options( - n_threads, - vae_decode_only, - free_params_immediately, - std::string(lora_model_dir), - rng_type, - vae_tiling, - wtype, - schedule - ); + n_threads, + vae_decode_only, + free_params_immediately, + std::string(lora_model_dir), + rng_type, + vae_tiling, + wtype, + schedule); } -bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; } - return sd_ctx->sd->load_clip_from_file(std::string(model_path), true,std::string(prefix)); + return sd_ctx->sd->load_clip_from_file(std::string(model_path), true, std::string(prefix)); } -void free_clip_params(sd_ctx_t *sd_ctx) { +void free_clip_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1428,7 +1414,7 @@ void free_clip_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_clip_params(); } -bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1436,7 +1422,7 @@ bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *p return sd_ctx->sd->load_unet_from_file(std::string(model_path), true, std::string(prefix)); } -void free_unet_params(sd_ctx_t *sd_ctx) { +void free_unet_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1444,7 +1430,7 @@ void free_unet_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_unet_params(); } -bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1452,7 +1438,7 @@ bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *pr return sd_ctx->sd->load_vae_from_file(std::string(model_path), true, std::string(prefix)); } -void free_vae_params(sd_ctx_t *sd_ctx) { +void free_vae_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1460,7 +1446,7 @@ void free_vae_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_vae_params(); } -bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { +bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { 
LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1468,7 +1454,7 @@ bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { return sd_ctx->sd->load_taesd_from_file(std::string(model_path)); } -void free_taesd_params(sd_ctx_t *sd_ctx) { +void free_taesd_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1477,7 +1463,7 @@ void free_taesd_params(sd_ctx_t *sd_ctx) { } // load all model from one file -bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { +bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1486,7 +1472,7 @@ bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { } // free all model from one file -void free_diffusions_params(sd_ctx_t *sd_ctx) { +void free_diffusions_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1494,9 +1480,9 @@ void free_diffusions_params(sd_ctx_t *sd_ctx) { return sd_ctx->sd->free_diffusions_params(); } -sd_image_t *txt2img(sd_ctx_t *sd_ctx, - const char *prompt_c_str, - const char *negative_prompt_c_str, +sd_image_t* txt2img(sd_ctx_t* sd_ctx, + const char* prompt_c_str, + const char* negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1514,10 +1500,10 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, std::string negative_prompt(negative_prompt_c_str); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto &kv: lora_f2m) { + for (auto& kv : lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } @@ -1533,10 +1519,10 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, params.mem_size += width * height * 3 * sizeof(float); params.mem_size *= batch_count; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context *work_ctx = ggml_init(params); + struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; @@ -1546,16 +1532,16 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library // by a third party with a seed <0, let's incorporate randomization here. 
- srand((int) time(NULL)); + srand((int)time(NULL)); seed = rand(); } - t0 = ggml_time_ms(); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor *c = cond_pair.first; - ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor *uc = NULL; - struct ggml_tensor *uc_vector = NULL; + t0 = ggml_time_ms(); + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor* c = cond_pair.first; + ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor* uc = NULL; + struct ggml_tensor* uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { @@ -1563,8 +1549,8 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, } auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); @@ -1573,23 +1559,23 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, sd_ctx->sd->cond_stage_model.free_params_buffer(); } - std::vector final_latents; // collect latents to decode + std::vector final_latents; // collect latents to decode int C = 4; int W = width / 8; int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); - int64_t cur_seed = seed + b; + int64_t cur_seed = seed + b; LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, + struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigmas); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1606,10 +1592,10 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, (t3 - t1) * 1.0f / 1000); LOG_INFO("decoding %zu latents", final_latents.size()); - std::vector decoded_images; // collect decoded images + std::vector decoded_images; // collect decoded images for (size_t i = 0; i < final_latents.size(); i++) { - t1 = ggml_time_ms(); - struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); + t1 = ggml_time_ms(); + struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); // print_ggml_tensor(img); if (img != NULL) { decoded_images.push_back(img); @@ -1623,30 +1609,30 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model.free_params_buffer(); } - sd_image_t *result_images = (sd_image_t *) calloc(batch_count, sizeof(sd_image_t)); + sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); 
return NULL; } for (size_t i = 0; i < decoded_images.size(); i++) { - result_images[i].width = width; - result_images[i].height = height; + result_images[i].width = width; + result_images[i].height = height; result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(decoded_images[i]); + result_images[i].data = sd_tensor_to_image(decoded_images[i]); } ggml_free(work_ctx); LOG_INFO( - "txt2img completed in %.2fs", - (t4 - t0) * 1.0f / 1000); + "txt2img completed in %.2fs", + (t4 - t0) * 1.0f / 1000); return result_images; } -sd_image_t *img2img(sd_ctx_t *sd_ctx, +sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, - const char *prompt_c_str, - const char *negative_prompt_c_str, + const char* prompt_c_str, + const char* negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1665,7 +1651,7 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, LOG_INFO("img2img %dx%d", width, height); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - size_t t_enc = static_cast(sample_steps * strength); + size_t t_enc = static_cast(sample_steps * strength); LOG_INFO("target t_enc is %zu steps", t_enc); std::vector sigma_sched; sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); @@ -1674,26 +1660,26 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, params.mem_size = static_cast(10 * 1024) * 1024; // 10 MB params.mem_size += width * height * 3 * sizeof(float) * 2; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); // draft context - struct ggml_context *work_ctx = ggml_init(params); + struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; } if (seed < 0) { - seed = (int) time(NULL); + seed = (int)time(NULL); } sd_ctx->sd->rng->manual_seed(seed); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto &kv: lora_f2m) { + for (auto& kv : lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } prompt = result_pair.second; @@ -1705,13 +1691,13 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, int64_t t1 = ggml_time_ms(); LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_tensor *init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(init_image.data, init_img); - t0 = ggml_time_ms(); - ggml_tensor *init_latent = NULL; + t0 = ggml_time_ms(); + ggml_tensor* init_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor *moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); } else { init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } @@ -1719,11 +1705,11 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor *c = cond_pair.first; - ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor *uc = NULL; - struct 
ggml_tensor *uc_vector = NULL; + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor* c = cond_pair.first; + ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor* uc = NULL; + struct ggml_tensor* uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { @@ -1731,8 +1717,8 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, } auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } int64_t t2 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1); @@ -1741,11 +1727,11 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, } sd_ctx->sd->rng->manual_seed(seed); - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, init_latent); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, + struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigma_sched); // struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1755,7 +1741,7 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, sd_ctx->sd->diffusion_model.free_params_buffer(); } - struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); + struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model.free_params_buffer(); } @@ -1764,17 +1750,17 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, return NULL; } - sd_image_t *result_images = (sd_image_t *) calloc(1, sizeof(sd_image_t)); + sd_image_t* result_images = (sd_image_t*)calloc(1, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); return NULL; } for (size_t i = 0; i < 1; i++) { - result_images[i].width = width; - result_images[i].height = height; + result_images[i].width = width; + result_images[i].height = height; result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(img); + result_images[i].data = sd_tensor_to_image(img); } ggml_free(work_ctx); diff --git a/stable-diffusion.h b/stable-diffusion.h index 0d59dce4..5a12543a 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -54,8 +54,8 @@ enum schedule_t { // same as enum ggml_type enum sd_type_t { - SD_TYPE_F32 = 0, - SD_TYPE_F16 = 1, + SD_TYPE_F32 = 0, + SD_TYPE_F16 = 1, SD_TYPE_Q4_0 = 2, SD_TYPE_Q4_1 = 3, // SD_TYPE_Q4_2 = 4, support has been removed @@ -65,12 +65,12 @@ enum sd_type_t { SD_TYPE_Q8_0 = 8, SD_TYPE_Q8_1 = 9, // k-quantizations - SD_TYPE_Q2_K = 10, - SD_TYPE_Q3_K = 11, - SD_TYPE_Q4_K = 12, - SD_TYPE_Q5_K = 13, - SD_TYPE_Q6_K = 14, - SD_TYPE_Q8_K = 15, + SD_TYPE_Q2_K = 10, + SD_TYPE_Q3_K = 11, + SD_TYPE_Q4_K = 12, + SD_TYPE_Q5_K = 13, + SD_TYPE_Q6_K = 14, + SD_TYPE_Q8_K = 15, SD_TYPE_IQ2_XXS = 16, SD_TYPE_I8, SD_TYPE_I16, @@ -78,7 +78,7 @@ enum sd_type_t { SD_TYPE_COUNT, }; -SD_API const char *sd_type_name(enum sd_type_t type); +SD_API const char* sd_type_name(enum 
sd_type_t type); enum sd_log_level_t { SD_LOG_DEBUG, @@ -87,36 +87,36 @@ enum sd_log_level_t { SD_LOG_ERROR }; -typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char *text, void *data); +typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); -SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void *data); +SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API int32_t get_num_physical_cores(); -SD_API const char *sd_get_system_info(); +SD_API const char* sd_get_system_info(); typedef struct { uint32_t width; uint32_t height; uint32_t channel; - uint8_t *data; + uint8_t* data; } sd_image_t; typedef struct sd_ctx_t sd_ctx_t; -SD_API sd_ctx_t *new_sd_ctx(int n_threads, +SD_API sd_ctx_t* new_sd_ctx(int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir_c_str, + const char* lora_model_dir_c_str, enum rng_type_t rng_type, bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, bool init_backend_immediately = true); -SD_API void free_sd_ctx(sd_ctx_t *sd_ctx); +SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -SD_API sd_image_t *txt2img(sd_ctx_t *sd_ctx, - const char *prompt, - const char *negative_prompt, +SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, + const char* prompt, + const char* negative_prompt, int clip_skip, float cfg_scale, int width, @@ -126,10 +126,10 @@ SD_API sd_image_t *txt2img(sd_ctx_t *sd_ctx, int64_t seed, int batch_count); -SD_API sd_image_t *img2img(sd_ctx_t *sd_ctx, +SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, - const char *prompt, - const char *negative_prompt, + const char* prompt, + const char* negative_prompt, int clip_skip, float cfg_scale, int width, @@ -142,46 +142,46 @@ SD_API sd_image_t *img2img(sd_ctx_t *sd_ctx, typedef struct upscaler_ctx_t upscaler_ctx_t; -SD_API upscaler_ctx_t *new_upscaler_ctx(const char *esrgan_path, +SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, int n_threads, enum sd_type_t wtype); -SD_API void free_upscaler_ctx(upscaler_ctx_t *upscaler_ctx); +SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); -SD_API sd_image_t upscale(upscaler_ctx_t *upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); +SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API void init_backend(sd_ctx_t *sd_ctx); +SD_API void init_backend(sd_ctx_t* sd_ctx); -SD_API void set_options(sd_ctx_t *sd_ctx, +SD_API void set_options(sd_ctx_t* sd_ctx, int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir, + const char* lora_model_dir, rng_type_t rng_type, bool vae_tiling, sd_type_t wtype, schedule_t schedule); -SD_API bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "te."); +SD_API bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix = "te."); -SD_API void free_clip_params(sd_ctx_t *sd_ctx); +SD_API void free_clip_params(sd_ctx_t* sd_ctx); -SD_API bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "unet."); +SD_API bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix = "unet."); -SD_API void free_unet_params(sd_ctx_t *sd_ctx); +SD_API void free_unet_params(sd_ctx_t* sd_ctx); -SD_API bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "vae."); +SD_API bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix = "vae."); 
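Taken together, the declarations above are already enough to drive generation end to end. A minimal sketch of a C++ caller (error handling trimmed; the model path and prompt are placeholders, and the defaulted trailing argument of new_sd_ctx() is relied on):

    #include "stable-diffusion.h"
    #include <cstdlib>

    int generate_once() {
        sd_ctx_t* ctx = new_sd_ctx(/*n_threads=*/-1, /*vae_decode_only=*/true,
                                   /*free_params_immediately=*/true, /*lora_model_dir=*/"",
                                   CUDA_RNG, /*vae_tiling=*/false, SD_TYPE_F16, DEFAULT);
        if (ctx == NULL)
            return 1;
        if (!load_diffusions_from_file(ctx, "model.safetensors")) {  // placeholder path
            free_sd_ctx(ctx);
            return 1;
        }
        sd_image_t* imgs = txt2img(ctx, "a photo of a cat", /*negative_prompt=*/"",
                                   /*clip_skip=*/-1, /*cfg_scale=*/7.0f, 512, 512,
                                   EULER_A, /*sample_steps=*/20, /*seed=*/42,
                                   /*batch_count=*/1);
        if (imgs != NULL) {
            free(imgs[0].data);  // each sd_image_t carries a heap-allocated RGB buffer
            free(imgs);
        }
        free_sd_ctx(ctx);
        return 0;
    }
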
-SD_API void free_vae_params(sd_ctx_t *sd_ctx); +SD_API void free_vae_params(sd_ctx_t* sd_ctx); -SD_API bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path); +SD_API bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path); -SD_API void free_taesd_params(sd_ctx_t *sd_ctx); +SD_API void free_taesd_params(sd_ctx_t* sd_ctx); -SD_API bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path); +SD_API bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path); -SD_API void free_diffusions_params(sd_ctx_t *sd_ctx); +SD_API void free_diffusions_params(sd_ctx_t* sd_ctx); -SD_API bool convert(const char *input_path, const char *vae_path, const char *output_path, sd_type_t output_type); +SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type); #ifdef __cplusplus } diff --git a/unet.hpp b/unet.hpp index 6b6e7439..19bb5553 100644 --- a/unet.hpp +++ b/unet.hpp @@ -390,7 +390,7 @@ struct SpatialTransformer { #if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false); // [N * n_head, h * w, d_head] #else - struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position] + struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position] // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0); kq = ggml_soft_max_inplace(ctx, kq); @@ -418,15 +418,15 @@ struct SpatialTransformer { { // GEGLU auto x_w = ggml_view_2d(ctx, - transformer.ff_0_proj_w, - transformer.ff_0_proj_w->ne[0], - transformer.ff_0_proj_w->ne[1] / 2, - transformer.ff_0_proj_w->nb[1], - 0); // [in_channels * 4, in_channels] + transformer.ff_0_proj_w, + transformer.ff_0_proj_w->ne[0], + transformer.ff_0_proj_w->ne[1] / 2, + transformer.ff_0_proj_w->nb[1], + 0); // [in_channels * 4, in_channels] auto x_b = ggml_view_1d(ctx, - transformer.ff_0_proj_b, - transformer.ff_0_proj_b->ne[0] / 2, - 0); // [in_channels * 4, in_channels] + transformer.ff_0_proj_b, + transformer.ff_0_proj_b->ne[0] / 2, + 0); // [in_channels * 4, in_channels] auto gate_w = ggml_view_2d(ctx, transformer.ff_0_proj_w, transformer.ff_0_proj_w->ne[0], From 8e973464174f8308ede35602c13f6349d5a6e8b9 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Wed, 24 Jan 2024 18:01:34 +0800 Subject: [PATCH 3/8] cli --- examples/cli/main.cpp | 537 +++++++++++++++++------------- stable-diffusion.cpp | 746 ++++++++++++++++++++++-------------------- 2 files changed, 702 insertions(+), 581 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index b08340b3..31893751 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "stable-diffusion.h" @@ -17,49 +18,53 @@ #include "stb_image_write.h" -const char* rng_type_to_str[] = { - "std_default", - "cuda", +const char *rng_type_to_str[] = { + "std_default", + "cuda", }; // Names of the sampler method, same order as enum sample_method in stable-diffusion.h -const char* sample_method_str[] = { - "euler_a", - "euler", - "heun", - "dpm2", - "dpm++2s_a", - "dpm++2m", - "dpm++2mv2", - "lcm", +const char *sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h -const char* schedule_str[] = { - "default", - "discrete", - "karras", +const char *schedule_str[] = { + 
"default", + "discrete", + "karras", }; -const char* modes_str[] = { - "txt2img", - "img2img", - "convert", +const char *modes_str[] = { + "txt2img", + "img2img", + "convert", + "stream" }; enum SDMode { TXT2IMG, IMG2IMG, CONVERT, + STREAM, MODE_COUNT }; struct SDParams { int n_threads = -1; - SDMode mode = TXT2IMG; + SDMode mode = TXT2IMG; std::string model_path; std::string vae_path; + std::string clip_path; + std::string unet_path; std::string taesd_path; std::string esrgan_path; sd_type_t wtype = SD_TYPE_COUNT; @@ -70,22 +75,23 @@ struct SDParams { std::string prompt; std::string negative_prompt; float cfg_scale = 7.0f; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; int batch_count = 1; sample_method_t sample_method = EULER_A; - schedule_t schedule = DEFAULT; - int sample_steps = 20; - float strength = 0.75f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool vae_tiling = false; + schedule_t schedule = DEFAULT; + int sample_steps = 20; + float strength = 0.75f; + rng_type_t rng_type = CUDA_RNG; + int64_t seed = 42; + bool verbose = false; + bool vae_tiling = false; + bool vae_decode_only = false; }; -static std::string sd_basename(const std::string& path) { +static std::string sd_basename(const std::string &path) { size_t pos = path.find_last_of('/'); if (pos != std::string::npos) { return path.substr(pos + 1); @@ -104,6 +110,8 @@ void print_params(SDParams params) { printf(" model_path: %s\n", params.model_path.c_str()); printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified"); printf(" vae_path: %s\n", params.vae_path.c_str()); + printf(" clip_path: %s\n", params.clip_path.c_str()); + printf(" unet_path: %s\n", params.unet_path.c_str()); printf(" taesd_path: %s\n", params.taesd_path.c_str()); printf(" esrgan_path: %s\n", params.esrgan_path.c_str()); printf(" output_path: %s\n", params.output_path.c_str()); @@ -124,16 +132,19 @@ void print_params(SDParams params) { printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false"); } -void print_usage(int argc, const char* argv[]) { +void print_usage(int argc, const char *argv[]) { printf("usage: %s [arguments]\n", argv[0]); printf("\n"); printf("arguments:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n"); + printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert or stream, default: txt2img)\n"); printf(" -t, --threads N number of threads to use during computation (default: -1).\n"); printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n"); printf(" -m, --model [MODEL] path to model\n"); + printf(" If the path is directory, support load model from \"unet/diffusion_pytorch_model.safetensors\", \"vae/diffusion_pytorch_model.safetensors\",\"text_encoder/model.safetensors\"\n"); printf(" --vae [VAE] path to vae\n"); + printf(" --clip [CLIP] path to clip\n"); + printf(" --unet [UNET] path to unet\n"); printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. 
Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n"); printf(" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n"); @@ -148,7 +159,7 @@ void print_usage(int argc, const char* argv[]) { printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); printf(" -W, --width W image width, in pixel space (default: 512)\n"); - printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n"); + printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n"); printf(" sampling method (default: \"euler_a\")\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); @@ -161,7 +172,7 @@ void print_usage(int argc, const char* argv[]) { printf(" -v, --verbose print extra info\n"); } -void parse_args(int argc, const char** argv, SDParams& params) { +void parse_args(int argc, const char **argv, SDParams ¶ms) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -178,19 +189,19 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* mode_selected = argv[i]; - int mode_found = -1; + const char *mode_selected = argv[i]; + int mode_found = -1; for (int d = 0; d < MODE_COUNT; d++) { if (!strcmp(mode_selected, modes_str[d])) { mode_found = d; } } if (mode_found == -1) { - fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img]\n", + fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img, convert, txt2img]\n", mode_selected); exit(1); } - params.mode = (SDMode)mode_found; + params.mode = (SDMode) mode_found; } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_arg = true; @@ -203,6 +214,18 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.vae_path = argv[i]; + } else if (arg == "--clip") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.vae_path = argv[i]; + } else if (arg == "--unet") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.vae_path = argv[i]; } else if (arg == "--taesd") { if (++i >= argc) { invalid_arg = true; @@ -334,8 +357,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* schedule_selected = argv[i]; - int schedule_found = -1; + const char *schedule_selected = argv[i]; + int schedule_found = -1; for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; @@ -345,7 +368,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.schedule = (schedule_t)schedule_found; + params.schedule = (schedule_t) schedule_found; } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_arg = true; @@ -357,8 +380,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* sample_method_selected = argv[i]; - int sample_method_found = -1; + const char *sample_method_selected = argv[i]; + int sample_method_found = -1; for (int m = 0; m < N_SAMPLE_METHODS; m++) { if (!strcmp(sample_method_selected, sample_method_str[m])) { sample_method_found = m; @@ -368,7 +391,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.sample_method = (sample_method_t)sample_method_found; 
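A note on the directory form of --model accepted above: the components are expected under a diffusers-style layout, and a caller resolving the files itself could do so along these lines (the struct and helper below are illustrative assumptions, not part of this patch):

    #include <string>

    struct ModelComponentPaths {
        std::string unet, vae, clip;
    };

    // Resolve per-component files under a diffusers-style model directory.
    ModelComponentPaths resolve_model_dir(const std::string& dir) {
        ModelComponentPaths p;
        p.unet = dir + "/unet/diffusion_pytorch_model.safetensors";
        p.vae  = dir + "/vae/diffusion_pytorch_model.safetensors";
        p.clip = dir + "/text_encoder/model.safetensors";
        return p;
    }
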
+ params.sample_method = (sample_method_t) sample_method_found; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv); exit(0); @@ -385,62 +408,65 @@ void parse_args(int argc, const char** argv, SDParams& params) { print_usage(argc, argv); exit(1); } + if (params.n_threads <= 0) { params.n_threads = get_num_physical_cores(); } - if (params.mode != CONVERT && params.prompt.length() == 0) { - fprintf(stderr, "error: the following arguments are required: prompt\n"); - print_usage(argc, argv); - exit(1); - } + if (params.mode != STREAM) { + if (params.mode != CONVERT && params.prompt.length() == 0) { + fprintf(stderr, "error: the following arguments are required: prompt\n"); + print_usage(argc, argv); + exit(1); + } - if (params.model_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: model_path\n"); - print_usage(argc, argv); - exit(1); - } + if (params.model_path.length() == 0) { + fprintf(stderr, "error: the following arguments are required: model_path\n"); + print_usage(argc, argv); + exit(1); + } - if (params.mode == IMG2IMG && params.input_path.length() == 0) { - fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n"); - print_usage(argc, argv); - exit(1); - } + if (params.mode == IMG2IMG && params.input_path.length() == 0) { + fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n"); + print_usage(argc, argv); + exit(1); + } - if (params.output_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: output_path\n"); - print_usage(argc, argv); - exit(1); - } + if (params.output_path.length() == 0) { + fprintf(stderr, "error: the following arguments are required: output_path\n"); + print_usage(argc, argv); + exit(1); + } - if (params.width <= 0 || params.width % 64 != 0) { - fprintf(stderr, "error: the width must be a multiple of 64\n"); - exit(1); - } + if (params.width <= 0 || params.width % 64 != 0) { + fprintf(stderr, "error: the width must be a multiple of 64\n"); + exit(1); + } - if (params.height <= 0 || params.height % 64 != 0) { - fprintf(stderr, "error: the height must be a multiple of 64\n"); - exit(1); - } + if (params.height <= 0 || params.height % 64 != 0) { + fprintf(stderr, "error: the height must be a multiple of 64\n"); + exit(1); + } - if (params.sample_steps <= 0) { - fprintf(stderr, "error: the sample_steps must be greater than 0\n"); - exit(1); - } + if (params.sample_steps <= 0) { + fprintf(stderr, "error: the sample_steps must be greater than 0\n"); + exit(1); + } - if (params.strength < 0.f || params.strength > 1.f) { - fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); - exit(1); - } + if (params.strength < 0.f || params.strength > 1.f) { + fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); + exit(1); + } - if (params.seed < 0) { - srand((int)time(NULL)); - params.seed = rand(); - } + if (params.seed < 0) { + srand((int) time(NULL)); + params.seed = rand(); + } - if (params.mode == CONVERT) { - if (params.output_path == "output.png") { - params.output_path = "output.gguf"; + if (params.mode == CONVERT) { + if (params.output_path == "output.png") { + params.output_path = "output.gguf"; + } } } } @@ -465,8 +491,8 @@ std::string get_image_params(SDParams params, int64_t seed) { return parameter_string; } -void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { - SDParams* params = (SDParams*)data; +void sd_log_cb(enum sd_log_level_t 
level, const char *log, void *data) {
-    SDParams* params = (SDParams*)data;
+    SDParams *params = (SDParams *) data;
     if (!params->verbose && level <= SD_LOG_DEBUG) {
         return;
     }
@@ -479,182 +505,243 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
     }
 }

-int main(int argc, const char* argv[]) {
-    SDParams params;
-    parse_args(argc, argv, params);
+std::vector<std::string> parse_cin(std::string &input, std::vector<std::string> ignore_args) {
+    std::vector<std::string> inputTokens;
+    std::istringstream iss(input);

-    sd_set_log_callback(sd_log_cb, (void*)&params);
-
-    if (params.verbose) {
-        print_params(params);
-        printf("%s", sd_get_system_info());
+    std::string word;
+    while (iss >> word) {
+        inputTokens.push_back(word);
     }

-    if (params.mode == CONVERT) {
-        bool success = convert(params.model_path.c_str(),
-                               params.vae_path.c_str(),
-                               params.output_path.c_str(),
-                               params.wtype);
-        if (!success) {
-            fprintf(stderr,
-                    "convert '%s'/'%s' to '%s' failed\n",
-                    params.model_path.c_str(),
-                    params.vae_path.c_str(),
-                    params.output_path.c_str());
-            return 1;
-        } else {
-            printf("convert '%s'/'%s' to '%s' success\n",
-                   params.model_path.c_str(),
-                   params.vae_path.c_str(),
-                   params.output_path.c_str());
-            return 0;
+    std::vector<std::string> commands;
+    for (size_t i = 0; i < inputTokens.size(); i++) {
+        if (std::find(ignore_args.begin(), ignore_args.end(), inputTokens[i]) != ignore_args.end()) {
+            i++; // skip the ignored token together with its value
+            continue;
         }
+        commands.push_back(inputTokens[i]);
     }
+    return commands;
+}

-    bool vae_decode_only        = true;
-    uint8_t* input_image_buffer = NULL;
-    if (params.mode == IMG2IMG) {
-        vae_decode_only = false;
+class CliInstance {
+public:
+    sd_ctx_t *sd_ctx;

-        int c              = 0;
+    ~CliInstance() {
+        free_sd_ctx(sd_ctx);
+    }
+
+    CliInstance(const SDParams &params) {
+        sd_ctx = new_sd_ctx(
+                params.n_threads,
+                params.vae_decode_only,
+                true,
+                params.lora_model_dir.c_str(),
+                params.rng_type,
+                params.vae_tiling,
+                params.wtype,
+                params.schedule,
+                true);
+    }
+
+    // TODO: load models dynamically
+
+    void txtimg(SDParams &params) {
+        set_options(sd_ctx, params.n_threads,
+                    params.vae_decode_only,
+                    true,
+                    params.lora_model_dir.c_str(),
+                    params.rng_type,
+                    params.vae_tiling,
+                    params.wtype,
+                    params.schedule);
+        sd_image_t *results = txt2img(sd_ctx,
+                                      params.prompt.c_str(),
+                                      params.negative_prompt.c_str(),
+                                      params.clip_skip,
+                                      params.cfg_scale,
+                                      params.width,
+                                      params.height,
+                                      params.sample_method,
+                                      params.sample_steps,
+                                      params.seed,
+                                      params.batch_count);
+        results = upscaler(params, results);
+        save_image(params, results);
+    }
+
+    void imgimg(SDParams &params) {
+        set_options(sd_ctx, params.n_threads,
+                    params.vae_decode_only,
+                    true,
+                    params.lora_model_dir.c_str(),
+                    params.rng_type,
+                    params.vae_tiling,
+                    params.wtype,
+                    params.schedule);
+        uint8_t *input_image_buffer = NULL;
+
+        int c = 0;
         input_image_buffer = stbi_load(params.input_path.c_str(), &params.width, &params.height, &c, 3);
         if (input_image_buffer == NULL) {
             fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str());
-            return 1;
+            return;
         }
         if (c != 3) {
             fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c);
             free(input_image_buffer);
-            return 1;
+            return;
         }
         if (params.width <= 0 || params.width % 64 != 0) {
             fprintf(stderr, "error: the width of image must be a multiple of 64\n");
             free(input_image_buffer);
-            return 1;
+            return;
         }
+
         if (params.height <= 0 || params.height % 64 != 0) {
             fprintf(stderr, "error: the height of image must be a multiple of 64\n");
             free(input_image_buffer);
-            return 1;
+            return;
         }
-    }

-    sd_ctx_t* sd_ctx = new_sd_ctx(
-        params.n_threads,
vae_decode_only, - true, - params.lora_model_dir.c_str(), - params.rng_type, - params.vae_tiling, - params.wtype, - params.schedule, - true); - - if (sd_ctx == NULL) { - printf("new_sd_ctx_t failed\n"); - return 1; - } + sd_image_t input_image = {(uint32_t) params.width, + (uint32_t) params.height, + 3, + input_image_buffer}; - if (!load_diffusions_from_file(sd_ctx, params.model_path.c_str())) { - printf("load diffusions model failed\n"); - return 1; + sd_image_t *results = img2img(sd_ctx, + input_image, + params.prompt.c_str(), + params.negative_prompt.c_str(), + params.clip_skip, + params.cfg_scale, + params.width, + params.height, + params.sample_method, + params.sample_steps, + params.strength, + params.seed, + params.batch_count); + results = upscaler(params, results); + save_image(params, results); } - if (!params.taesd_path.empty()) { - free_unet_params(sd_ctx); - if (!load_taesd_from_file(sd_ctx, params.taesd_path.c_str())) { - printf("load taesd model failed\n"); - return 1; +protected: + + void save_image(const SDParams ¶ms, sd_image_t *results) { + size_t last = params.output_path.find_last_of("."); + std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path; + for (int i = 0; i < params.batch_count; i++) { + if (results[i].data == NULL) { + continue; + } + std::string final_image_path = + i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; + stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, + results[i].data, 0, get_image_params(params, params.seed + i).c_str()); + printf("save result image to '%s'\n", final_image_path.c_str()); + free(results[i].data); + results[i].data = NULL; } + free(results); } - if (!params.vae_path.empty()) { - free_vae_params(sd_ctx); - if (!load_vae_from_file(sd_ctx, params.vae_path.c_str())) { - printf("load vae model failed\n"); - return 1; + sd_image_t *upscaler(const SDParams ¶ms, sd_image_t *results) { + int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth + if (params.esrgan_path.size() > 0) { + upscaler_ctx_t *upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + params.n_threads, + params.wtype); + + if (upscaler_ctx == NULL) { + printf("new_upscaler_ctx failed\n"); + } else { + for (int i = 0; i < params.batch_count; i++) { + if (results[i].data == NULL) { + continue; + } + sd_image_t upscaled_image = upscale(upscaler_ctx, results[i], upscale_factor); + if (upscaled_image.data == NULL) { + printf("upscale failed\n"); + continue; + } + free(results[i].data); + results[i] = upscaled_image; + } + free_upscaler_ctx(upscaler_ctx); + } } + return results; } +}; - sd_image_t* results; - if (params.mode == TXT2IMG) { - results = txt2img(sd_ctx, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - params.cfg_scale, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.seed, - params.batch_count); - } else { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, - 3, - input_image_buffer}; +int main(int argc, const char *argv[]) { + SDParams params; + parse_args(argc, argv, params); - results = img2img(sd_ctx, - input_image, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - params.cfg_scale, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.strength, - params.seed, - params.batch_count); - } + sd_set_log_callback(sd_log_cb, (void *) ¶ms); - if 
(results == NULL) {
-        printf("generate failed\n");
-        free_sd_ctx(sd_ctx);
-        return 1;
+    if (params.verbose) {
+        print_params(params);
+        printf("%s", sd_get_system_info());
     }

-    int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
-    if (params.esrgan_path.size() > 0) {
-        upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
-                                                        params.n_threads,
-                                                        params.wtype);
-
-        if (upscaler_ctx == NULL) {
-            printf("new_upscaler_ctx failed\n");
+    if (params.mode == CONVERT) {
+        bool success = convert(params.model_path.c_str(),
+                               params.vae_path.c_str(),
+                               params.output_path.c_str(),
+                               params.wtype);
+        if (!success) {
+            fprintf(stderr,
+                    "convert '%s'/'%s' to '%s' failed\n",
+                    params.model_path.c_str(),
+                    params.vae_path.c_str(),
+                    params.output_path.c_str());
+            return 1;
         } else {
-            for (int i = 0; i < params.batch_count; i++) {
-                if (results[i].data == NULL) {
-                    continue;
-                }
-                sd_image_t upscaled_image = upscale(upscaler_ctx, results[i], upscale_factor);
-                if (upscaled_image.data == NULL) {
-                    printf("upscale failed\n");
-                    continue;
-                }
-                free(results[i].data);
-                results[i] = upscaled_image;
-            }
+            printf("convert '%s'/'%s' to '%s' success\n",
+                   params.model_path.c_str(),
+                   params.vae_path.c_str(),
+                   params.output_path.c_str());
+            return 0;
         }
     }

-    size_t last            = params.output_path.find_last_of(".");
-    std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
-    for (int i = 0; i < params.batch_count; i++) {
-        if (results[i].data == NULL) {
-            continue;
+    auto instance = new CliInstance(params);
+    if (params.mode == STREAM) {
+        while (true) {
+            std::cout << "You are in stream mode; feel free to use txt2img or img2img" << std::endl;
+            std::string input;
+            std::getline(std::cin, input);
+            std::vector<std::string> ignore_cmd = {""};
+            auto args = parse_cin(input, ignore_cmd);
+            SDParams stream_params;
+            // parse_args() starts reading at argv[1], so prepend a dummy program name
+            const char **args_c_arr = new const char *[args.size() + 1];
+            args_c_arr[0] = "sd";
+            for (size_t i = 0; i < args.size(); ++i) {
+                args_c_arr[i + 1] = args[i].c_str();
+            }
+            parse_args((int) args.size() + 1, args_c_arr, stream_params);
+            delete[] args_c_arr;
+            if (stream_params.mode == TXT2IMG) {
+                instance->txtimg(stream_params);
+            } else if (stream_params.mode == IMG2IMG) {
+                instance->imgimg(stream_params);
+            } else {
+                exit(1);
+            }
+        }
+    } else {
+        if (params.mode == TXT2IMG) {
+            instance->txtimg(params);
+        } else if (params.mode == IMG2IMG) {
+            instance->imgimg(params);
+        } else {
+            exit(1);
         }
     }
-        std::string final_image_path = i > 0 ?
dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; - stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 0, get_image_params(params, params.seed + i).c_str()); - printf("save result image to '%s'\n", final_image_path.c_str()); - free(results[i].data); - results[i].data = NULL; } - free(results); - free_sd_ctx(sd_ctx); - return 0; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 28f5d8c8..e3090803 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -14,35 +14,35 @@ #include "unet.hpp" #include "vae.hpp" -const char* model_version_to_str[] = { - "1.x", - "2.x", - "XL", +const char *model_version_to_str[] = { + "1.x", + "2.x", + "XL", }; -const char* sampling_methods_str[] = { - "Euler A", - "Euler", - "Heun", - "DPM2", - "DPM++ (2s)", - "DPM++ (2M)", - "modified DPM++ (2M)", - "LCM", +const char *sampling_methods_str[] = { + "Euler A", + "Euler", + "Heun", + "DPM2", + "DPM++ (2s)", + "DPM++ (2M)", + "modified DPM++ (2M)", + "LCM", }; /*================================================== Helper Functions ================================================*/ -void calculate_alphas_cumprod(float* alphas_cumprod, +void calculate_alphas_cumprod(float *alphas_cumprod, float linear_start = 0.00085f, - float linear_end = 0.0120, - int timesteps = TIMESTEPS) { + float linear_end = 0.0120, + int timesteps = TIMESTEPS) { float ls_sqrt = sqrtf(linear_start); float le_sqrt = sqrtf(linear_end); - float amount = le_sqrt - ls_sqrt; + float amount = le_sqrt - ls_sqrt; float product = 1.0f; for (int i = 0; i < timesteps; i++) { - float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); + float beta = ls_sqrt + amount * ((float) i / (timesteps - 1)); product *= 1.0f - powf(beta, 2.0f); alphas_cumprod[i] = product; } @@ -53,20 +53,20 @@ void calculate_alphas_cumprod(float* alphas_cumprod, class StableDiffusionGGML { public: SDVersion version; - bool vae_decode_only = false; + bool vae_decode_only = false; bool free_params_immediately = false; std::shared_ptr rng = std::make_shared(); - int n_threads = -1; - float scale_factor = 0.18215f; + int n_threads = -1; + float scale_factor = 0.18215f; FrozenCLIPEmbedderWithCustomWords cond_stage_model; UNetModel diffusion_model; AutoEncoderKL first_stage_model; bool use_tiny_autoencoder = false; - bool vae_tiling = false; + bool vae_tiling = false; - std::map tensors; + std::map tensors; std::string lora_model_dir; // lora_name => multiplier @@ -74,13 +74,17 @@ class StableDiffusionGGML { std::map loras; std::shared_ptr denoiser = std::make_shared(); - schedule_t schedule = DEFAULT; + schedule_t schedule = DEFAULT; - ggml_backend_t backend = NULL; // general backend + ggml_backend_t backend = NULL; // general backend ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type - ggml_type wtype = GGML_TYPE_COUNT; // options weight type + ggml_type wtype = GGML_TYPE_COUNT; // options weight type TinyAutoEncoder tae_first_stage; + + std::string clip_path; + std::string vae_path; + std::string unet_path; std::string taesd_path; ModelLoader model_loader; @@ -96,15 +100,15 @@ class StableDiffusionGGML { ggml_type wtype, schedule_t schedule, bool init_backend_immediately = true) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - free_params_immediately(free_params_immediately), - lora_model_dir(lora_model_dir), - vae_tiling(vae_tiling), - wtype(wtype), - schedule(schedule) { + : n_threads(n_threads), + vae_decode_only(vae_decode_only), + 
free_params_immediately(free_params_immediately),
+            lora_model_dir(lora_model_dir),
+            vae_tiling(vae_tiling),
+            wtype(wtype),
+            schedule(schedule) {
         first_stage_model.decode_only = vae_decode_only;
-        tae_first_stage.decode_only   = vae_decode_only;
+        tae_first_stage.decode_only = vae_decode_only;
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
@@ -151,22 +155,41 @@ class StableDiffusionGGML {
                      bool vae_tiling,
                      sd_type_t wtype,
                      schedule_t schedule) {
-        this->n_threads       = n_threads;
-        this->vae_decode_only = vae_decode_only;
+        this->n_threads = n_threads;
+        bool standalone = vae_path != clip_path && vae_path != unet_path;
+        if (this->vae_decode_only != vae_decode_only) {
+            this->vae_decode_only = vae_decode_only;
+            if (!vae_path.empty() && first_stage_model.params_buffer_size > 0) {
+                free_vae_params();
+                std::string prefix;
+                if (standalone) {
+                    prefix = "vae.";
+                }
+                load_vae_from_file(vae_path, standalone, prefix);
+            }
+        }
+
         this->free_params_immediately = free_params_immediately;
-        this->lora_model_dir          = lora_model_dir;
+        this->lora_model_dir = std::move(lora_model_dir);
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
         }
         this->vae_tiling = vae_tiling;
-        this->wtype      = (ggml_type)wtype;
-        this->schedule   = schedule;
-        apply_schedule();
+
+        if (this->wtype != (ggml_type) wtype) {
+            this->wtype = (ggml_type) wtype;
+            // TODO: changing wtype requires reloading the model weights
+        }
+
+        if (this->schedule != schedule) {
+            this->schedule = schedule;
+            apply_schedule();
+        }
     }

-    bool load_clip_from_file(const std::string& model_path, bool standalone = true, const std::string& prefix = "te.") {
+    bool load_clip_from_file(const std::string &model_path, bool standalone = true, const std::string &prefix = "te.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -232,11 +255,11 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(3 * 1024) * 1024;  // 10M
+        params.mem_size = static_cast<size_t>(3 * 1024) * 1024; // 3M
         params.mem_buffer = NULL;
-        params.no_alloc   = false;
+        params.no_alloc = false;
         // LOG_DEBUG("mem_size %u ", params.mem_size);
-        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check
         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
             return false;
@@ -246,10 +269,10 @@ class StableDiffusionGGML {
         LOG_DEBUG("loading clip weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor *> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        for (auto& pair : tensors) {
+        for (auto &pair: tensors) {
             tensors_need_to_load.insert(pair);
         }
@@ -264,6 +287,7 @@ class StableDiffusionGGML {
         int64_t t1 = ggml_time_ms();
         LOG_INFO("loading clip model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000);
         ggml_free(ctx);
+        clip_path = model_path;
         return true;
     }
@@ -273,9 +297,9 @@ class StableDiffusionGGML {
         }
     }

-    bool load_unet_from_file(const std::string& model_path,
-                             bool standalone           = true,
-                             const std::string& prefix = "unet.") {
+    bool load_unet_from_file(const std::string &model_path,
+                             bool standalone = true,
+                             const std::string &prefix = "unet.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -308,11 +332,11 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(3 * 1024) * 1024;  // 10M
+        params.mem_size = static_cast<size_t>(3 * 1024) * 1024; // 3M
         params.mem_buffer = NULL;
-        params.no_alloc   = false;
+        params.no_alloc = false;

-        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check

         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
@@ -323,13 +347,13 @@ class StableDiffusionGGML {
         LOG_DEBUG("loading weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor *> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
-        calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data);
+        ggml_tensor *alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
+        calculate_alphas_cumprod((float *) alphas_cumprod_tensor->data);
         tensors_need_to_load["alphas_cumprod"] = alphas_cumprod_tensor;
-        for (auto& pair : tensors) {
-            const std::string& name = pair.first;
+        for (auto &pair: tensors) {
+            const std::string &name = pair.first;
             if (starts_with(name, "cond_stage_model.") || starts_with(name, "first_stage_model.")) {
                 ignore_tensors.insert(name);
                 continue;
@@ -362,6 +386,7 @@ class StableDiffusionGGML {
         apply_schedule();

         ggml_free(ctx);
+        unet_path = model_path;
         return true;
     }
@@ -371,9 +396,9 @@ class StableDiffusionGGML {
         }
     }

-    bool load_vae_from_file(const std::string& model_path,
-                            bool standalone           = true,
-                            const std::string& prefix = "vae.") {
+    bool load_vae_from_file(const std::string &model_path,
+                            bool standalone = true,
+                            const std::string &prefix = "vae.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -410,11 +435,11 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(10 * 1024) * 1024;  // 10M
+        params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10M
         params.mem_buffer = NULL;
-        params.no_alloc   = false;
+        params.no_alloc = false;
         // LOG_DEBUG("mem_size %u ", params.mem_size);
-        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check
         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
             return false;
@@ -424,17 +449,15 @@ class StableDiffusionGGML {
         LOG_DEBUG("loading weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor *> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        for (auto& pair : tensors) {
-            const std::string& name = pair.first;
-            // TODO: make it can reload in compute time. so we can set vae_decode_only dynamic.
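        // Why the filtering below matters: with vae_decode_only set, the VAE encoder and
        // quantization tensors (first_stage_model.encoder.*, first_stage_model.quant*) are
        // never loaded, since txt2img only runs the decoder. img2img needs the encoder
        // again, which is why set_options() above reloads the VAE when the flag changes.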
+ for (auto &pair: tensors) { + const std::string &name = pair.first; if (vae_decode_only && (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) { ignore_tensors.insert(name); continue; } - tensors_need_to_load.insert(pair); } bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors, standalone); @@ -447,6 +470,7 @@ class StableDiffusionGGML { int64_t t1 = ggml_time_ms(); LOG_INFO("loading vae model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); ggml_free(ctx); + vae_path = model_path; return true; } @@ -457,7 +481,7 @@ class StableDiffusionGGML { } // load the all model from one file - bool load_diffusions_from_file(const std::string& model_path) { + bool load_diffusions_from_file(const std::string &model_path) { LOG_INFO("loading model from '%s'", model_path.c_str()); if (!load_clip_from_file(model_path, false, "")) { free_clip_params(); @@ -491,13 +515,15 @@ class StableDiffusionGGML { LOG_INFO("free vae params"); } - bool load_taesd_from_file(const std::string& taesd_path) { + bool load_taesd_from_file(const std::string &taesd_path) { if (first_stage_model.params_buffer_size > 0) { free_vae_params(); } if (taesd_path.empty() || !tae_first_stage.load_from_file(taesd_path, backend)) { return false; } + + this->taesd_path = taesd_path; use_tiny_autoencoder = true; return true; } @@ -508,34 +534,34 @@ class StableDiffusionGGML { } } - bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) { - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); + bool is_using_v_parameterization_for_sd2(ggml_context *work_ctx) { + struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); ggml_set_f32(x_t, 0.5); - struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); + struct ggml_tensor *c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); ggml_set_f32(c, 0.5); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); // [N, ] - struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, - diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] int64_t t0 = ggml_time_ms(); ggml_set_f32(timesteps, 999); set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels); - struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *out = ggml_dup_tensor(work_ctx, x_t); diffusion_model.alloc_compute_buffer(x_t, c, t_emb); diffusion_model.compute(out, n_threads, x_t, NULL, c, t_emb); diffusion_model.free_compute_buffer(); double result = 0.f; { - float* vec_x = (float*)x_t->data; - float* vec_out = (float*)out->data; + float *vec_x = (float *) x_t->data; + float *vec_out = (float *) out->data; int64_t n = ggml_nelements(out); for (int i = 0; i < n; i++) { - result += ((double)vec_out[i] - (double)vec_x[i]); + result += ((double) vec_out[i] - (double) vec_x[i]); } result /= n; } @@ -568,15 +594,15 @@ class StableDiffusionGGML { for (int i = 0; i < TIMESTEPS; i++) { denoiser->schedule->alphas_cumprod[i] = alphas_cumprod_tensor[i]; - denoiser->schedule->sigmas[i] = std::sqrt( - (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); + denoiser->schedule->sigmas[i] = std::sqrt( + (1 - 
denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); } } - void apply_lora(const std::string& lora_name, float multiplier) { - int64_t t0 = ggml_time_ms(); - std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); + void apply_lora(const std::string &lora_name, float multiplier) { + int64_t t0 = ggml_time_ms(); + std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); std::string file_path; if (file_exists(st_file_path)) { @@ -606,14 +632,14 @@ class StableDiffusionGGML { (t1 - t0) * 1.0f / 1000); } - void apply_loras(const std::unordered_map& lora_state) { + void apply_loras(const std::unordered_map &lora_state) { if (lora_state.size() > 0 && model_data_type != GGML_TYPE_F16 && model_data_type != GGML_TYPE_F32) { LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); } std::unordered_map lora_state_diff; - for (auto& kv : lora_state) { - const std::string& lora_name = kv.first; - float multiplier = kv.second; + for (auto &kv: lora_state) { + const std::string &lora_name = kv.first; + float multiplier = kv.second; if (curr_lora_state.find(lora_name) != curr_lora_state.end()) { float curr_multiplier = curr_lora_state[lora_name]; @@ -626,35 +652,35 @@ class StableDiffusionGGML { } } - for (auto& kv : lora_state_diff) { + for (auto &kv: lora_state_diff) { apply_lora(kv.first, kv.second); } curr_lora_state = lora_state; } - std::pair get_learned_condition(ggml_context* work_ctx, - const std::string& text, - int clip_skip, - int width, - int height, - bool force_zero_embeddings = false) { + std::pair get_learned_condition(ggml_context *work_ctx, + const std::string &text, + int clip_skip, + int width, + int height, + bool force_zero_embeddings = false) { cond_stage_model.set_clip_skip(clip_skip); - auto tokens_and_weights = cond_stage_model.tokenize(text, true); - std::vector& tokens = tokens_and_weights.first; - std::vector& weights = tokens_and_weights.second; - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* pooled = NULL; - size_t total_hidden_size = cond_stage_model.text_model.hidden_size; + auto tokens_and_weights = cond_stage_model.tokenize(text, true); + std::vector &tokens = tokens_and_weights.first; + std::vector &weights = tokens_and_weights.second; + int64_t t0 = ggml_time_ms(); + struct ggml_tensor *pooled = NULL; + size_t total_hidden_size = cond_stage_model.text_model.hidden_size; if (version == VERSION_XL) { total_hidden_size += cond_stage_model.text_model2.hidden_size; pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, cond_stage_model.text_model2.projection_dim); } - struct ggml_tensor* hidden_states = ggml_new_tensor_2d(work_ctx, + struct ggml_tensor *hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, total_hidden_size, cond_stage_model.text_model.max_position_embeddings); // [N, n_token, hidden_size] - cond_stage_model.alloc_compute_buffer(work_ctx, (int)tokens.size()); + cond_stage_model.alloc_compute_buffer(work_ctx, (int) tokens.size()); cond_stage_model.compute(n_threads, tokens, hidden_states, pooled); cond_stage_model.free_compute_buffer(); // if (pooled != NULL) { @@ -664,7 +690,7 @@ class StableDiffusionGGML { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states); + 
ggml_tensor *result = ggml_dup_tensor(work_ctx, hidden_states); { float original_mean = ggml_tensor_mean(hidden_states); for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) { @@ -680,34 +706,34 @@ class StableDiffusionGGML { ggml_tensor_scale(result, (original_mean / new_mean)); } if (force_zero_embeddings) { - float* vec = (float*)result->data; + float *vec = (float *) result->data; for (int i = 0; i < ggml_nelements(result); i++) { vec[i] = 0; } } - ggml_tensor* vec = NULL; + ggml_tensor *vec = NULL; if (version == VERSION_XL) { int out_dim = 256; - vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); + vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); // [0:1280] size_t offset = 0; memcpy(vec->data, pooled->data, ggml_nbytes(pooled)); offset += ggml_nbytes(pooled); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); // original_size_as_tuple - float orig_width = (float)width; - float orig_height = (float)height; + float orig_width = (float) width; + float orig_height = (float) height; ggml_tensor_set_f32(timesteps, orig_height, 0); ggml_tensor_set_f32(timesteps, orig_width, 1); - ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, + ggml_tensor *embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); offset += ggml_nbytes(embed_view); set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // crop_coords_top_left - float crop_coord_top = 0.f; + float crop_coord_top = 0.f; float crop_coord_left = 0.f; ggml_tensor_set_f32(timesteps, crop_coord_top, 0); ggml_tensor_set_f32(timesteps, crop_coord_left, 1); @@ -716,8 +742,8 @@ class StableDiffusionGGML { set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // target_size_as_tuple - float target_width = (float)width; - float target_height = (float)height; + float target_width = (float) width; + float target_height = (float) height; ggml_tensor_set_f32(timesteps, target_height, 0); ggml_tensor_set_f32(timesteps, target_width, 1); embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); @@ -730,27 +756,27 @@ class StableDiffusionGGML { return {result, vec}; } - ggml_tensor* sample(ggml_context* work_ctx, - ggml_tensor* x_t, - ggml_tensor* noise, - ggml_tensor* c, - ggml_tensor* c_vector, - ggml_tensor* uc, - ggml_tensor* uc_vector, + ggml_tensor *sample(ggml_context *work_ctx, + ggml_tensor *x_t, + ggml_tensor *noise, + ggml_tensor *c, + ggml_tensor *c_vector, + ggml_tensor *uc, + ggml_tensor *uc_vector, float cfg_scale, sample_method_t method, - const std::vector& sigmas) { + const std::vector &sigmas) { size_t steps = sigmas.size() - 1; // x_t = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(x_t); - struct ggml_tensor* x = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *x = ggml_dup_tensor(work_ctx, x_t); copy_ggml_tensor(x, x_t); - struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, - 1); // [N, ] - struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, - diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor 
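        // The SDXL condition vector assembled above concatenates the pooled text
        // embedding (text_model2.projection_dim floats) with three 2x256 timestep-style
        // embeddings for (orig_height, orig_width), (crop_top, crop_left) and
        // (target_height, target_width); with SDXL's 1280-dim pooled output that is
        // 1280 + 3 * 512 = 2816 floats, matching diffusion_model.adm_in_channels.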
*noised_input = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + 1); // [N, ] + struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] diffusion_model.alloc_compute_buffer(noised_input, c, t_emb, c_vector); bool has_unconditioned = cfg_scale != 1.0 && uc != NULL; @@ -765,31 +791,31 @@ class StableDiffusionGGML { } // denoise wrapper - struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* out_uncond = NULL; + struct ggml_tensor *out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *out_uncond = NULL; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); } - struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *denoised = ggml_dup_tensor(work_ctx, x); - auto denoise = [&](ggml_tensor* input, float sigma, int step) { + auto denoise = [&](ggml_tensor *input, float sigma, int step) { if (step == 1) { - pretty_progress(0, (int)steps, 0); + pretty_progress(0, (int) steps, 0); } int64_t t0 = ggml_time_us(); - float c_skip = 1.0f; - float c_out = 1.0f; - float c_in = 1.0f; + float c_skip = 1.0f; + float c_out = 1.0f; + float c_in = 1.0f; std::vector scaling = denoiser->get_scalings(sigma); if (scaling.size() == 3) { // CompVisVDenoiser c_skip = scaling[0]; - c_out = scaling[1]; - c_in = scaling[2]; + c_out = scaling[1]; + c_in = scaling[2]; } else { // CompVisDenoiser c_out = scaling[0]; - c_in = scaling[1]; + c_in = scaling[1]; } float t = denoiser->schedule->sigma_to_t(sigma); @@ -803,16 +829,16 @@ class StableDiffusionGGML { // cond diffusion_model.compute(out_cond, n_threads, noised_input, NULL, c, t_emb, c_vector); - float* negative_data = NULL; + float *negative_data = NULL; if (has_unconditioned) { // uncond diffusion_model.compute(out_uncond, n_threads, noised_input, NULL, uc, t_emb, uc_vector); - negative_data = (float*)out_uncond->data; + negative_data = (float *) out_uncond->data; } - float* vec_denoised = (float*)denoised->data; - float* vec_input = (float*)input->data; - float* positive_data = (float*)out_cond->data; - int ne_elements = (int)ggml_nelements(denoised); + float *vec_denoised = (float *) denoised->data; + float *vec_input = (float *) input->data; + float *positive_data = (float *) out_cond->data; + int ne_elements = (int) ggml_nelements(denoised); for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -825,7 +851,7 @@ class StableDiffusionGGML { } int64_t t1 = ggml_time_us(); if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + pretty_progress(step, (int) steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } }; @@ -833,8 +859,8 @@ class StableDiffusionGGML { // sample_euler_ancestral switch (method) { case EULER_A: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -844,9 +870,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) 
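            // The denoise lambda above combines the two U-Net passes with the standard
            // classifier-free guidance mix,
            //     latent_result = uncond + cfg_scale * (cond - uncond),
            // then rescales it as denoised = latent_result * c_out + input * c_skip.
            // With cfg_scale = 1 this reduces to the conditional prediction alone, which
            // is why uc/uc_vector are only computed when cfg_scale != 1.0.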
denoised->data; for (int i = 0; i < ggml_nelements(d); i++) { vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; @@ -854,18 +880,18 @@ class StableDiffusionGGML { } // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * - (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / - (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); // Euler method float dt = sigma_down - sigmas[i]; // x = x + d * dt { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_d[i] * dt; @@ -877,8 +903,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -886,10 +912,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case EULER: // Implemented without any sigma churn { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -899,9 +926,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma; @@ -911,18 +938,19 @@ class StableDiffusionGGML { float dt = sigmas[i + 1] - sigma; // x = x + d * dt { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } } - } break; + } + break; case HEUN: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -930,9 +958,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -943,25 +971,25 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } else { // Heun step - float* vec_d = (float*)d->data; - float* vec_d2 = (float*)d->data; - float* vec_x = 
(float*)x->data; - float* vec_x2 = (float*)x2->data; + float *vec_d = (float *) d->data; + float *vec_d2 = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt; } denoise(x2, sigmas[i + 1], i + 1); - float* vec_denoised = (float*)denoised->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; vec_d[j] = (vec_d[j] + d2) / 2; @@ -969,10 +997,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case DPM2: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -980,9 +1009,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -992,9 +1021,9 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float dt = sigmas[i + 1] - sigmas[i]; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float dt = sigmas[i + 1] - sigmas[i]; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; @@ -1002,18 +1031,18 @@ class StableDiffusionGGML { } else { // DPM-Solver-2 float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1]))); - float dt_1 = sigma_mid - sigmas[i]; - float dt_2 = sigmas[i + 1] - sigmas[i]; + float dt_1 = sigma_mid - sigmas[i]; + float dt_2 = sigmas[i + 1] - sigmas[i]; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt_1; } denoise(x2, sigma_mid, i + 1); - float* vec_denoised = (float*)denoised->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; vec_x[j] = vec_x[j] + d2 * dt_2; @@ -1021,30 +1050,31 @@ class StableDiffusionGGML { } } - } break; + } + break; case DPMPP2S_A: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise denoise(x, sigmas[i], i + 1); // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * - (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / - (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * 
sigmas[i + 1] - sigma_up * sigma_up); - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - auto sigma_fn = [](float t) -> float { return exp(-t); }; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + auto sigma_fn = [](float t) -> float { return exp(-t); }; if (sigma_down == 0) { // Euler step - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -1060,15 +1090,15 @@ class StableDiffusionGGML { } } else { // DPM-Solver++(2S) - float t = t_fn(sigmas[i]); + float t = t_fn(sigmas[i]); float t_next = t_fn(sigma_down); - float h = t_next - t; - float s = t + 0.5f * h; + float h = t_next - t; + float s = t + 0.5f * h; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; + float *vec_denoised = (float *) denoised->data; // First half-step for (int j = 0; j < ggml_nelements(x); j++) { @@ -1087,8 +1117,8 @@ class StableDiffusionGGML { if (sigmas[i + 1] > 0) { ggml_tensor_set_f32_randn(noise, rng); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -1096,10 +1126,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case DPMPP2M: // DPM++ (2M) from Karras et al (2022) { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1107,14 +1138,14 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float b = exp(-h) - 1.f; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float b = exp(-h) - 1.f; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; + float *vec_old_denoised = (float *) old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -1123,10 +1154,10 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float r = h_last / h; + float r = h_last / h; for (int j = 0; j < ggml_nelements(x); j++) { float denoised_d = - (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -1136,10 +1167,11 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } break; + } + break; case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *old_denoised = 
ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1147,13 +1179,13 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; + float *vec_old_denoised = (float *) old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -1163,14 +1195,14 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float h_min = std::min(h_last, h); - float h_max = std::max(h_last, h); - float r = h_max / h_min; - float h_d = (h_max + h_min) / 2.f; - float b = exp(-h_d) - 1.f; + float h_min = std::min(h_last, h); + float h_max = std::max(h_last, h); + float r = h_max / h_min; + float h_d = (h_max + h_min) / 2.f; + float b = exp(-h_d) - 1.f; for (int j = 0; j < ggml_nelements(x); j++) { float denoised_d = - (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -1180,11 +1212,12 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } break; + } + break; case LCM: // Latent Consistency Models { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -1194,8 +1227,8 @@ class StableDiffusionGGML { // x = denoised { - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_denoised[j]; } @@ -1206,8 +1239,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j]; @@ -1215,7 +1248,8 @@ class StableDiffusionGGML { } } } - } break; + } + break; default: LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); @@ -1226,28 +1260,28 @@ class StableDiffusionGGML { } // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { + ggml_tensor *get_first_stage_encoding(ggml_context *work_ctx, ggml_tensor *moments) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], - moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); + ggml_tensor *latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], 
moments->ne[1], + moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, latent); ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "noise.bin"); { - float mean = 0; + float mean = 0; float logvar = 0; - float value = 0; - float std_ = 0; + float value = 0; + float std_ = 0; for (int i = 0; i < latent->ne[3]; i++) { for (int j = 0; j < latent->ne[2]; j++) { for (int k = 0; k < latent->ne[1]; k++) { for (int l = 0; l < latent->ne[0]; l++) { - mean = ggml_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); + mean = ggml_tensor_get_f32(moments, l, k, j, i); + logvar = ggml_tensor_get_f32(moments, l, k, j + (int) latent->ne[2], i); logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); - value = value * scale_factor; + std_ = std::exp(0.5f * logvar); + value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); + value = value * scale_factor; // printf("%d %d %d %d -> %f\n", i, j, k, l, value); ggml_tensor_set_f32(latent, value, l, k, j, i); } @@ -1258,14 +1292,14 @@ class StableDiffusionGGML { return latent; } - ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - ggml_tensor* result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, + ggml_tensor *compute_first_stage(ggml_context *work_ctx, ggml_tensor *x, bool decode) { + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + ggml_tensor *result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, decode ? (W * 8) : (W / 8), // width decode ? (H * 8) : (H / 8), // height decode ? 3 : (use_tiny_autoencoder ? 4 : 8)); // channels - int64_t t0 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); if (!use_tiny_autoencoder) { if (decode) { ggml_tensor_scale(x, 1.0f / scale_factor); @@ -1274,7 +1308,7 @@ class StableDiffusionGGML { } if (vae_tiling && decode) { // TODO: support tiling vae encode // split latent in 32x32 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) { if (init) { first_stage_model.alloc_compute_buffer(in, decode); } else { @@ -1293,7 +1327,7 @@ class StableDiffusionGGML { } else { if (vae_tiling && decode) { // TODO: support tiling vae encode // split latent in 64x64 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) { if (init) { tae_first_stage.alloc_compute_buffer(in, decode); } else { @@ -1316,11 +1350,11 @@ class StableDiffusionGGML { return result; } - ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + ggml_tensor *encode_first_stage(ggml_context *work_ctx, ggml_tensor *x) { return compute_first_stage(work_ctx, x, false); } - ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + ggml_tensor *decode_first_stage(ggml_context *work_ctx, ggml_tensor *x) { return compute_first_stage(work_ctx, x, true); } }; @@ -1328,19 +1362,19 @@ class StableDiffusionGGML { /*================================================= SD API ==================================================*/ struct sd_ctx_t { - StableDiffusionGGML* sd = NULL; + StableDiffusionGGML *sd = NULL; }; -sd_ctx_t* new_sd_ctx(int n_threads, +sd_ctx_t *new_sd_ctx(int n_threads, bool vae_decode_only, 
bool free_params_immediately, - const char* lora_model_dir_c_str, + const char *lora_model_dir_c_str, enum rng_type_t rng_type, bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, bool init_backend_immediately) { - sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); + sd_ctx_t *sd_ctx = (sd_ctx_t *) malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; } @@ -1358,7 +1392,7 @@ sd_ctx_t* new_sd_ctx(int n_threads, return sd_ctx; } -void free_sd_ctx(sd_ctx_t* sd_ctx) { +void free_sd_ctx(sd_ctx_t *sd_ctx) { if (sd_ctx->sd != NULL) { delete sd_ctx->sd; sd_ctx->sd = NULL; @@ -1366,7 +1400,7 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) { free(sd_ctx); } -void init_backend(sd_ctx_t* sd_ctx) { +void init_backend(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1374,11 +1408,11 @@ void init_backend(sd_ctx_t* sd_ctx) { sd_ctx->sd->init_backend(); } -void set_options(sd_ctx_t* sd_ctx, +void set_options(sd_ctx_t *sd_ctx, int n_threads, bool vae_decode_only, bool free_params_immediately, - const char* lora_model_dir, + const char *lora_model_dir, rng_type_t rng_type, bool vae_tiling, sd_type_t wtype, @@ -1388,17 +1422,17 @@ void set_options(sd_ctx_t* sd_ctx, return; } sd_ctx->sd->set_options( - n_threads, - vae_decode_only, - free_params_immediately, - std::string(lora_model_dir), - rng_type, - vae_tiling, - wtype, - schedule); + n_threads, + vae_decode_only, + free_params_immediately, + std::string(lora_model_dir), + rng_type, + vae_tiling, + wtype, + schedule); } -bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { +bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1406,7 +1440,7 @@ bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* p return sd_ctx->sd->load_clip_from_file(std::string(model_path), true, std::string(prefix)); } -void free_clip_params(sd_ctx_t* sd_ctx) { +void free_clip_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1414,7 +1448,7 @@ void free_clip_params(sd_ctx_t* sd_ctx) { sd_ctx->sd->free_clip_params(); } -bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { +bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1422,7 +1456,7 @@ bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* p return sd_ctx->sd->load_unet_from_file(std::string(model_path), true, std::string(prefix)); } -void free_unet_params(sd_ctx_t* sd_ctx) { +void free_unet_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1430,7 +1464,7 @@ void free_unet_params(sd_ctx_t* sd_ctx) { sd_ctx->sd->free_unet_params(); } -bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { +bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1438,7 +1472,7 @@ bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* pr return sd_ctx->sd->load_vae_from_file(std::string(model_path), true, std::string(prefix)); } -void 
free_vae_params(sd_ctx_t* sd_ctx) { +void free_vae_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1446,7 +1480,7 @@ void free_vae_params(sd_ctx_t* sd_ctx) { sd_ctx->sd->free_vae_params(); } -bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path) { +bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1454,7 +1488,7 @@ bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path) { return sd_ctx->sd->load_taesd_from_file(std::string(model_path)); } -void free_taesd_params(sd_ctx_t* sd_ctx) { +void free_taesd_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1463,7 +1497,7 @@ void free_taesd_params(sd_ctx_t* sd_ctx) { } // load all model from one file -bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path) { +bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1472,7 +1506,7 @@ bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path) { } // free all model from one file -void free_diffusions_params(sd_ctx_t* sd_ctx) { +void free_diffusions_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1480,9 +1514,9 @@ void free_diffusions_params(sd_ctx_t* sd_ctx) { return sd_ctx->sd->free_diffusions_params(); } -sd_image_t* txt2img(sd_ctx_t* sd_ctx, - const char* prompt_c_str, - const char* negative_prompt_c_str, +sd_image_t *txt2img(sd_ctx_t *sd_ctx, + const char *prompt_c_str, + const char *negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1500,10 +1534,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, std::string negative_prompt(negative_prompt_c_str); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto& kv : lora_f2m) { + for (auto &kv: lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } @@ -1519,10 +1553,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, params.mem_size += width * height * 3 * sizeof(float); params.mem_size *= batch_count; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context* work_ctx = ggml_init(params); + struct ggml_context *work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; @@ -1532,16 +1566,16 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library // by a third party with a seed <0, let's incorporate randomization here. 
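        // A minimal sketch of the seeding contract assumed at this point: a
        // negative seed is randomized once up front, and batch image b is then
        // sampled with seed + b (see cur_seed below), so the reported seed
        // reproduces the whole batch. resolve_seed is an illustrative helper,
        // not part of the exported API:
        //
        //     int64_t resolve_seed(int64_t seed) {
        //         if (seed < 0) {               // library caller asked for "random"
        //             srand((int) time(NULL));  // same fallback as the code below
        //             seed = rand();
        //         }
        //         return seed;                  // image b is then sampled with seed + b
        //     }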
- srand((int)time(NULL)); + srand((int) time(NULL)); seed = rand(); } - t0 = ggml_time_ms(); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor* c = cond_pair.first; - ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor* uc = NULL; - struct ggml_tensor* uc_vector = NULL; + t0 = ggml_time_ms(); + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor *c = cond_pair.first; + ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor *uc = NULL; + struct ggml_tensor *uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { @@ -1549,8 +1583,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, } auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); @@ -1559,23 +1593,23 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, sd_ctx->sd->cond_stage_model.free_params_buffer(); } - std::vector final_latents; // collect latents to decode + std::vector final_latents; // collect latents to decode int C = 4; int W = width / 8; int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); - int64_t cur_seed = seed + b; + int64_t cur_seed = seed + b; LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, + struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigmas); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1592,10 +1626,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, (t3 - t1) * 1.0f / 1000); LOG_INFO("decoding %zu latents", final_latents.size()); - std::vector decoded_images; // collect decoded images + std::vector decoded_images; // collect decoded images for (size_t i = 0; i < final_latents.size(); i++) { - t1 = ggml_time_ms(); - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); + t1 = ggml_time_ms(); + struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); // print_ggml_tensor(img); if (img != NULL) { decoded_images.push_back(img); @@ -1609,30 +1643,30 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model.free_params_buffer(); } - sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); + sd_image_t *result_images = (sd_image_t *) calloc(batch_count, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); 
return NULL; } for (size_t i = 0; i < decoded_images.size(); i++) { - result_images[i].width = width; - result_images[i].height = height; + result_images[i].width = width; + result_images[i].height = height; result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(decoded_images[i]); + result_images[i].data = sd_tensor_to_image(decoded_images[i]); } ggml_free(work_ctx); LOG_INFO( - "txt2img completed in %.2fs", - (t4 - t0) * 1.0f / 1000); + "txt2img completed in %.2fs", + (t4 - t0) * 1.0f / 1000); return result_images; } -sd_image_t* img2img(sd_ctx_t* sd_ctx, +sd_image_t *img2img(sd_ctx_t *sd_ctx, sd_image_t init_image, - const char* prompt_c_str, - const char* negative_prompt_c_str, + const char *prompt_c_str, + const char *negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1651,7 +1685,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, LOG_INFO("img2img %dx%d", width, height); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - size_t t_enc = static_cast(sample_steps * strength); + size_t t_enc = static_cast(sample_steps * strength); LOG_INFO("target t_enc is %zu steps", t_enc); std::vector sigma_sched; sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); @@ -1660,26 +1694,26 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, params.mem_size = static_cast(10 * 1024) * 1024; // 10 MB params.mem_size += width * height * 3 * sizeof(float) * 2; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); // draft context - struct ggml_context* work_ctx = ggml_init(params); + struct ggml_context *work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; } if (seed < 0) { - seed = (int)time(NULL); + seed = (int) time(NULL); } sd_ctx->sd->rng->manual_seed(seed); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto& kv : lora_f2m) { + for (auto &kv: lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } prompt = result_pair.second; @@ -1691,13 +1725,13 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, int64_t t1 = ggml_time_ms(); LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + ggml_tensor *init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(init_image.data, init_img); - t0 = ggml_time_ms(); - ggml_tensor* init_latent = NULL; + t0 = ggml_time_ms(); + ggml_tensor *init_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + ggml_tensor *moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); } else { init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } @@ -1705,11 +1739,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor* c = cond_pair.first; - ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor* uc = NULL; - struct 
ggml_tensor* uc_vector = NULL; + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor *c = cond_pair.first; + ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor *uc = NULL; + struct ggml_tensor *uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { @@ -1717,8 +1751,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } int64_t t2 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1); @@ -1727,11 +1761,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } sd_ctx->sd->rng->manual_seed(seed); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, init_latent); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, + struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigma_sched); // struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1741,7 +1775,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model.free_params_buffer(); } - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); + struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model.free_params_buffer(); } @@ -1750,17 +1784,17 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, return NULL; } - sd_image_t* result_images = (sd_image_t*)calloc(1, sizeof(sd_image_t)); + sd_image_t *result_images = (sd_image_t *) calloc(1, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); return NULL; } for (size_t i = 0; i < 1; i++) { - result_images[i].width = width; - result_images[i].height = height; + result_images[i].width = width; + result_images[i].height = height; result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(img); + result_images[i].data = sd_tensor_to_image(img); } ggml_free(work_ctx); From 18aec69789427eee04a1d5b55f43d0d5d9da2734 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Fri, 26 Jan 2024 18:02:16 +0800 Subject: [PATCH 4/8] full ci --todo reload model --- examples/cli/main.cpp | 329 ++++++++++++++++++++++++++++++++++-------- stable-diffusion.cpp | 24 ++- 2 files changed, 292 insertions(+), 61 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 31893751..6ab638e5 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "stable-diffusion.h" @@ -408,67 +409,88 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { print_usage(argc, argv); exit(1); } +} + +bool check_params(SDParams params) { + std::vector required_args; + std::vector invalid_args; if (params.n_threads <= 0) { params.n_threads = get_num_physical_cores(); } - if (params.mode != STREAM) { - if 
(params.mode != CONVERT && params.prompt.length() == 0) { - fprintf(stderr, "error: the following arguments are required: prompt\n"); - print_usage(argc, argv); - exit(1); - } + if (params.mode != CONVERT && params.prompt.length() == 0) { + required_args.emplace_back("prompt"); + } - if (params.model_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: model_path\n"); - print_usage(argc, argv); - exit(1); - } + if (params.model_path.length() == 0) { + required_args.emplace_back("model_path"); + } - if (params.mode == IMG2IMG && params.input_path.length() == 0) { - fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n"); - print_usage(argc, argv); - exit(1); - } + if (params.mode == IMG2IMG && params.input_path.length() == 0) { + required_args.emplace_back("init-img"); + } - if (params.output_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: output_path\n"); - print_usage(argc, argv); - exit(1); - } + if (params.output_path.length() == 0) { + required_args.emplace_back("output_path"); + } - if (params.width <= 0 || params.width % 64 != 0) { - fprintf(stderr, "error: the width must be a multiple of 64\n"); - exit(1); - } + if (params.width <= 0 || params.width % 64 != 0) { + invalid_args.emplace_back("the width must be a multiple of 64"); + } - if (params.height <= 0 || params.height % 64 != 0) { - fprintf(stderr, "error: the height must be a multiple of 64\n"); - exit(1); - } + if (params.height <= 0 || params.height % 64 != 0) { + invalid_args.emplace_back("the height must be a multiple of 64"); + } - if (params.sample_steps <= 0) { - fprintf(stderr, "error: the sample_steps must be greater than 0\n"); - exit(1); - } + if (params.sample_steps <= 0) { + invalid_args.emplace_back("the sample_steps must be greater than 0"); + } - if (params.strength < 0.f || params.strength > 1.f) { - fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); - exit(1); + if (params.strength < 0.f || params.strength > 1.f) { + invalid_args.emplace_back("can only work with strength in [0.0, 1.0]"); + } + + if (params.seed < 0) { + srand((int) time(NULL)); + params.seed = rand(); + } + + if (params.mode == CONVERT) { + if (params.output_path == "output.png") { + params.output_path = "output.gguf"; } + } - if (params.seed < 0) { - srand((int) time(NULL)); - params.seed = rand(); + if ((!invalid_args.empty()) || (!required_args.empty())) { + if (!invalid_args.empty()) { + std::ostringstream oss; + for (int i = 0; i < invalid_args.size(); i++) { + if (i > 0) { + oss << ",\n"; + } + oss << invalid_args[i]; + } + std::string invalid_args_str = oss.str(); + std::cout << "error: " << invalid_args_str << std::endl; } - if (params.mode == CONVERT) { - if (params.output_path == "output.png") { - params.output_path = "output.gguf"; + if (!required_args.empty()) { + std::ostringstream oss; + for (int i = 0; i < required_args.size(); i++) { + if (i > 0) { + oss << ","; + } + oss << required_args[i]; } + std::string required_args_str = oss.str(); + std::cout << "require: " << required_args_str << std::endl; } + + return false; } + + return true; } std::string get_image_params(SDParams params, int64_t seed) { @@ -505,20 +527,38 @@ void sd_log_cb(enum sd_log_level_t level, const char *log, void *data) { } } -std::vector parse_cin(std::string &input, std::vector ignore_args) { +std::vector parse_cin(std::string &input, std::set ignore_args) { std::vector inputTokens; std::string token; 
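    // The additions below give the STREAM REPL shell-like quote grouping:
    // a word starting with '"' opens a quoted run, subsequent words are
    // buffered space-separated in stmt, and a word ending with '"' closes
    // the run and strips the outer quotes. A worked example (illustrative
    // input, not taken from the source):
    //
    //     input : -p "a red fox" -m model.gguf
    //     tokens: ["fake run path, no use!", "-p", "a red fox", "-m", "model.gguf"]
    //
    // The fake argv[0] is pushed first because parse_args starts reading at
    // i = 1; an unterminated quote leaves the trailing words buffered in stmt.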
std::istringstream iss(input); std::string word; + bool in_stmt = false; + std::string stmt; + inputTokens.emplace_back("fake run path, no use!"); while (iss >> word) { + if (word[0] == '"') { + in_stmt = true; + } + + if (word[word.length() - 1] == '"') { + stmt += word; + word = stmt.substr(1, stmt.length() - 2); + stmt = ""; + in_stmt = false; + } + + if (in_stmt) { + stmt += word; + stmt += " "; + continue; + } inputTokens.push_back(word); } std::vector commands; for (int i = 0; i < inputTokens.size(); i++) { - - if (std::find(ignore_args.begin(), ignore_args.end(), inputTokens[i]) != ignore_args.end()) { + if (ignore_args.find(inputTokens[i]) != ignore_args.end()) { i++; continue; } @@ -527,6 +567,128 @@ std::vector parse_cin(std::string &input, std::vector return commands; } +SDParams merge_params(SDParams dst, SDParams src) { + if (dst.n_threads != src.n_threads) { + if (src.n_threads > 0) { + dst.n_threads = src.n_threads; + } + } + + if (dst.mode != src.mode) { + if (src.mode == TXT2IMG || src.mode == IMG2IMG) { + dst.mode = src.mode; + if (dst.mode == IMG2IMG) { + dst.vae_decode_only = false; + } + } + } + + if (dst.model_path != src.model_path) { + if (!src.model_path.empty()) { + dst.model_path = src.model_path; + } + } + + if (dst.vae_path != src.vae_path) { + if (!src.vae_path.empty()) { + dst.vae_path = src.vae_path; + } + } + + if (dst.clip_path != src.clip_path) { + if (!src.clip_path.empty()) { + dst.clip_path = src.clip_path; + } + } + + if (dst.unet_path != src.unet_path) { + if (!src.unet_path.empty()) { + dst.unet_path = src.unet_path; + } + } + + if (dst.taesd_path != src.taesd_path) { + if (!src.taesd_path.empty()) { + dst.taesd_path = src.taesd_path; + } + } + + if (dst.esrgan_path != src.esrgan_path) { + if (!src.esrgan_path.empty()) { + dst.esrgan_path = src.esrgan_path; + } + } + + if (dst.wtype != src.wtype) { + dst.wtype = src.wtype; + } + + if (dst.lora_model_dir != src.lora_model_dir) { + if (!src.lora_model_dir.empty()) { + dst.lora_model_dir = src.lora_model_dir; + } + } + + if (dst.output_path != src.output_path) { + if (!src.output_path.empty()) { + dst.output_path = src.output_path; + } + } + + if (dst.prompt != src.prompt) { + if (!src.prompt.empty()) { + dst.prompt = src.prompt; + } + } + + if (dst.negative_prompt != src.negative_prompt) { + if (!src.negative_prompt.empty()) { + dst.negative_prompt = src.negative_prompt; + } + } + + if (dst.cfg_scale != src.cfg_scale) { + if (src.cfg_scale >= 0) { + dst.cfg_scale = src.cfg_scale; + } + } + + if (dst.clip_skip != src.clip_skip) { + dst.clip_skip = src.clip_skip; + } + + if (dst.width != src.width) { + if (src.width > 0 || src.width % 64 == 0) { + dst.width = src.width; + } + } + + if (dst.height != src.height) { + if (src.height > 0 || src.height % 64 == 0) { + dst.height = src.height; + } + } + + if (dst.sample_steps != src.sample_steps) { + if (src.sample_steps > 0) { + dst.sample_steps = src.sample_steps; + } + } + + if (dst.strength != src.strength) { + if (src.strength >= 0.f && src.strength <= 1.f) { + dst.strength = src.strength; + } + } + + if (dst.seed != src.seed) { + if (src.seed > 0) { + dst.seed = src.seed; + } + } + return dst; +} + class CliInstance { public: sd_ctx_t *sd_ctx; @@ -548,7 +710,28 @@ class CliInstance { true); } - //TODO: dynamic load model + bool load_from_file(SDParams ¶ms) { + // free api always check if the following methods can free, so we can always free the model before load it. 
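        // Sketch of the reload sequence this method performs, free before
        // load, one component at a time (the two-argument load_* calls assume
        // the default prefix arguments declared in stable-diffusion.h):
        //
        //     free_diffusions_params(ctx);                 // drop any stale weights
        //     load_diffusions_from_file(ctx, model_path);  // combined checkpoint
        //     // optional per-component overrides:
        //     free_clip_params(ctx); load_clip_from_file(ctx, clip_path);
        //     free_vae_params(ctx);  load_vae_from_file(ctx, vae_path);
        //     free_unet_params(ctx); load_unet_from_file(ctx, unet_path);
        //
        // As the comment above notes, each free_* checks whether there is
        // anything to release, so freeing before every load is always safe.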
+ free_diffusions_params(sd_ctx); + auto load_status = load_diffusions_from_file(sd_ctx, params.model_path.c_str()); + + if (load_status && !params.clip_path.empty()) { + free_clip_params(sd_ctx); + load_status = load_clip_from_file(sd_ctx, params.clip_path.c_str()); + } + + if (load_status && !params.vae_path.empty()) { + free_vae_params(sd_ctx); + load_status = load_vae_from_file(sd_ctx, params.vae_path.c_str()); + } + + if (load_status && !params.unet_path.empty()) { + free_unet_params(sd_ctx); + load_status = load_unet_from_file(sd_ctx, params.unet_path.c_str()); + } + + return load_status; + } void txtimg(SDParams ¶ms) { set_options(sd_ctx, params.n_threads, @@ -682,8 +865,13 @@ class CliInstance { int main(int argc, const char *argv[]) { SDParams params; + parse_args(argc, argv, params); + if (params.mode != STREAM && !check_params(params)) { + return 1; + } + sd_set_log_callback(sd_log_cb, (void *) ¶ms); if (params.verbose) { @@ -713,34 +901,61 @@ int main(int argc, const char *argv[]) { } auto instance = new CliInstance(params); + if (params.mode == STREAM) { + std::cout << "you are in stream model, feel free to use txt2img or img2img" << std::endl; while (true) { - std::cout << "you are in stream model, take free to use txt2img or img2img" << std::endl; std::string input; + std::cout << "please input args: " << std::endl; std::getline(std::cin, input); - std::vector ignore_cmd = {""}; - auto args = parse_cin(input, ignore_cmd); + //hold an ignore cmd for feature to ignore the cmd not support + std::set ignore_cmd = {""}; + std::vector args = parse_cin(input, ignore_cmd); SDParams stream_params; const char **args_c_arr = new const char *[args.size()]; - for (int i = 0; i < args.size(); ++i) { - args_c_arr[i] = args[i].c_str(); + for (int i = 0; i < args.size(); i++) { + std::string arg = args[i]; + char *c_str = new char[args[i].length() + 1]; + std::strcpy(c_str, arg.c_str()); + args_c_arr[i] = c_str; } parse_args(args.size(), args_c_arr, stream_params); - if (stream_params.mode == TXT2IMG) { - instance->txtimg(stream_params); - } else if (stream_params.mode == IMG2IMG) { - instance->imgimg(stream_params); + if (params.model_path != stream_params.model_path || + params.clip_path != stream_params.clip_path || + params.vae_path != stream_params.vae_path || + params.unet_path != stream_params.unet_path) { + instance->load_from_file(stream_params); + } + params = merge_params(params, stream_params); + if (!check_params(params)) { + continue; + } + if (params.mode == TXT2IMG) { + instance->txtimg(params); + } else if (params.mode == IMG2IMG) { + instance->imgimg(params); } else { - exit(1); + return 1; } } } else { + if (!params.model_path.empty()) { + if (!instance->load_from_file(params)) { + return 1; + } + } else { + if (!params.clip_path.empty() && !params.vae_path.empty() && !params.unet_path.empty()) { + if (!instance->load_from_file(params)) { + return 1; + } + } + } if (params.mode == TXT2IMG) { instance->txtimg(params); } else if (params.mode == IMG2IMG) { instance->imgimg(params); } else { - exit(1); + return 0; } } return 0; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e3090803..5ead3c0b 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -82,6 +82,7 @@ class StableDiffusionGGML { TinyAutoEncoder tae_first_stage; + std::string model_path; std::string clip_path; std::string vae_path; std::string unet_path; @@ -156,7 +157,17 @@ class StableDiffusionGGML { sd_type_t wtype, schedule_t schedule) { this->n_threads = n_threads; - bool 
standalone=vae_path != clip_path && vae_path != unet_path; + bool standalone = clip_path != vae_path && vae_path != unet_path; + + std::string model_path; + if (!standalone && clip_path == vae_path) { + model_path = clip_path; + } + + if (!standalone && vae_path == unet_path) { + model_path = vae_path; + } + if (this->vae_decode_only != vae_decode_only) { this->vae_decode_only = vae_decode_only; if (!vae_path.empty() && first_stage_model.params_buffer_size > 0) { @@ -178,12 +189,17 @@ class StableDiffusionGGML { } this->vae_tiling = vae_tiling; - if (this->wtype !=(ggml_type) wtype) { + if (this->wtype != (ggml_type) wtype) { this->wtype = (ggml_type) wtype; - // TODO: change wtype, need reload model + // TODO: can reload weight +// if (!standalone) { +// free_diffusions_params(); +// load_diffusions_from_file(model_path); +// } + } - if (this->schedule!=schedule){ + if (this->schedule != schedule) { this->schedule = schedule; apply_schedule(); } From 525f54b8710b20693aa7899279210e7e2006d5be Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Sun, 4 Feb 2024 11:08:06 +0800 Subject: [PATCH 5/8] format code --- common.hpp | 2 +- examples/cli/main.cpp | 95 ++++++++++++------------ stable-diffusion.cpp | 164 +++++++++++++++++++++--------------------- stable-diffusion.h | 12 ++-- 4 files changed, 134 insertions(+), 139 deletions(-) diff --git a/common.hpp b/common.hpp index 4a423d5a..b79a3c92 100644 --- a/common.hpp +++ b/common.hpp @@ -465,7 +465,7 @@ struct SpatialTransformer { #if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false); // [N * n_head, h * w, d_head] #else - struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position] + struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position] // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0); kq = ggml_soft_max_inplace(ctx, kq); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index fa393489..0ab66e70 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -3,10 +3,10 @@ #include #include #include +#include +#include #include #include -#include -#include #include "preprocessing.hpp" #include "stable-diffusion.h" @@ -20,9 +20,9 @@ #include "stb_image_write.h" -const char *rng_type_to_str[] = { - "std_default", - "cuda", +const char* rng_type_to_str[] = { + "std_default", + "cuda", }; // Names of the sampler method, same order as enum sample_method in stable-diffusion.h @@ -93,7 +93,7 @@ struct SDParams { int64_t seed = 42; bool verbose = false; bool vae_tiling = false; - bool vae_decode_only = false; + bool vae_decode_only = false; bool control_net_cpu = false; bool canny_preprocess = false; }; @@ -190,7 +190,7 @@ void print_usage(int argc, const char* argv[]) { printf(" -v, --verbose print extra info\n"); } -void parse_args(int argc, const char **argv, SDParams ¶ms) { +void parse_args(int argc, const char** argv, SDParams& params) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -496,7 +496,7 @@ bool check_params(SDParams params) { } if (params.seed < 0) { - srand((int) time(NULL)); + srand((int)time(NULL)); params.seed = rand(); } @@ -571,8 +571,7 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { } } - -std::vector parse_cin(std::string &input, std::set ignore_args) { +std::vector parse_cin(std::string& input, std::set ignore_args) { std::vector inputTokens; std::string token; std::istringstream 
iss(input); @@ -588,8 +587,8 @@ std::vector parse_cin(std::string &input, std::set ign if (word[word.length() - 1] == '"') { stmt += word; - word = stmt.substr(1, stmt.length() - 2); - stmt = ""; + word = stmt.substr(1, stmt.length() - 2); + stmt = ""; in_stmt = false; } @@ -736,26 +735,26 @@ SDParams merge_params(SDParams dst, SDParams src) { class CliInstance { public: - sd_ctx_t *sd_ctx; + sd_ctx_t* sd_ctx; ~CliInstance() { free_sd_ctx(sd_ctx); } - CliInstance(const SDParams ¶ms) { + CliInstance(const SDParams& params) { sd_ctx = new_sd_ctx( - params.n_threads, - params.vae_decode_only, - true, - params.lora_model_dir.c_str(), - params.rng_type, - params.vae_tiling, - params.wtype, - params.schedule, - true); - } - - bool load_from_file(SDParams ¶ms) { + params.n_threads, + params.vae_decode_only, + true, + params.lora_model_dir.c_str(), + params.rng_type, + params.vae_tiling, + params.wtype, + params.schedule, + true); + } + + bool load_from_file(SDParams& params) { // free api always check if the following methods can free, so we can always free the model before load it. free_diffusions_params(sd_ctx); auto load_status = load_diffusions_from_file(sd_ctx, params.model_path.c_str()); @@ -778,7 +777,7 @@ class CliInstance { return load_status; } - void txtimg(SDParams ¶ms) { + void txtimg(SDParams& params) { set_options(sd_ctx, params.n_threads, params.vae_decode_only, true, @@ -787,7 +786,7 @@ class CliInstance { params.vae_tiling, params.wtype, params.schedule); - sd_image_t *results = txt2img(sd_ctx, + sd_image_t* results = txt2img(sd_ctx, params.prompt.c_str(), params.negative_prompt.c_str(), params.clip_skip, @@ -798,12 +797,11 @@ class CliInstance { params.sample_steps, params.seed, params.batch_count); - results = upscaler(params, results); + results = upscaler(params, results); save_image(params, results); - } - void imgimg(SDParams ¶ms) { + void imgimg(SDParams& params) { set_options(sd_ctx, params.n_threads, params.vae_decode_only, true, @@ -812,9 +810,9 @@ class CliInstance { params.vae_tiling, params.wtype, params.schedule); - uint8_t *input_image_buffer = NULL; + uint8_t* input_image_buffer = NULL; - int c = 0; + int c = 0; input_image_buffer = stbi_load(params.input_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); if (input_image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); @@ -837,12 +835,12 @@ class CliInstance { return; } - sd_image_t input_image = {(uint32_t) params.width, - (uint32_t) params.height, + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, 3, input_image_buffer}; - sd_image_t *results = img2img(sd_ctx, + sd_image_t* results = img2img(sd_ctx, input_image, params.prompt.c_str(), params.negative_prompt.c_str(), @@ -855,21 +853,20 @@ class CliInstance { params.strength, params.seed, params.batch_count); - results = upscaler(params, results); + results = upscaler(params, results); save_image(params, results); } protected: - - void save_image(const SDParams ¶ms, sd_image_t *results) { - size_t last = params.output_path.find_last_of("."); + void save_image(const SDParams& params, sd_image_t* results) { + size_t last = params.output_path.find_last_of("."); std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path; for (int i = 0; i < params.batch_count; i++) { if (results[i].data == NULL) { continue; } std::string final_image_path = - i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; + i > 0 ? 
dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, results[i].data, 0, get_image_params(params, params.seed + i).c_str()); printf("save result image to '%s'\n", final_image_path.c_str()); @@ -879,10 +876,10 @@ class CliInstance { free(results); } - sd_image_t *upscaler(const SDParams ¶ms, sd_image_t *results) { + sd_image_t* upscaler(const SDParams& params, sd_image_t* results) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0) { - upscaler_ctx_t *upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.n_threads, params.wtype); @@ -908,7 +905,7 @@ class CliInstance { } }; -int main(int argc, const char *argv[]) { +int main(int argc, const char* argv[]) { SDParams params; parse_args(argc, argv, params); @@ -917,7 +914,7 @@ int main(int argc, const char *argv[]) { return 1; } - sd_set_log_callback(sd_log_cb, (void *) ¶ms); + sd_set_log_callback(sd_log_cb, (void*)¶ms); if (params.verbose) { print_params(params); @@ -953,14 +950,14 @@ int main(int argc, const char *argv[]) { std::string input; std::cout << "please input args: " << std::endl; std::getline(std::cin, input); - //hold an ignore cmd for feature to ignore the cmd not support + // hold an ignore cmd for feature to ignore the cmd not support std::set ignore_cmd = {""}; - std::vector args = parse_cin(input, ignore_cmd); + std::vector args = parse_cin(input, ignore_cmd); SDParams stream_params; - const char **args_c_arr = new const char *[args.size()]; + const char** args_c_arr = new const char*[args.size()]; for (int i = 0; i < args.size(); i++) { std::string arg = args[i]; - char *c_str = new char[args[i].length() + 1]; + char* c_str = new char[args[i].length() + 1]; std::strcpy(c_str, arg.c_str()); args_c_arr[i] = c_str; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 8077aeb8..e29839b6 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -75,11 +75,11 @@ class StableDiffusionGGML { std::map loras; std::shared_ptr denoiser = std::make_shared(); - schedule_t schedule = DEFAULT; + schedule_t schedule = DEFAULT; - ggml_backend_t backend = NULL; // general backend + ggml_backend_t backend = NULL; // general backend ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type - ggml_type wtype = GGML_TYPE_COUNT; // options weight type + ggml_type wtype = GGML_TYPE_COUNT; // options weight type TinyAutoEncoder tae_first_stage; @@ -104,15 +104,15 @@ class StableDiffusionGGML { ggml_type wtype, schedule_t schedule, bool init_backend_immediately = true) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - free_params_immediately(free_params_immediately), - lora_model_dir(lora_model_dir), - vae_tiling(vae_tiling), - wtype(wtype), - schedule(schedule) { + : n_threads(n_threads), + vae_decode_only(vae_decode_only), + free_params_immediately(free_params_immediately), + lora_model_dir(lora_model_dir), + vae_tiling(vae_tiling), + wtype(wtype), + schedule(schedule) { first_stage_model.decode_only = vae_decode_only; - tae_first_stage.decode_only = vae_decode_only; + tae_first_stage.decode_only = vae_decode_only; if (rng_type == STD_DEFAULT_RNG) { rng = std::make_shared(); } else if (rng_type == CUDA_RNG) { @@ -152,13 +152,13 @@ class StableDiffusionGGML { } void set_options(int n_threads, - bool vae_decode_only, - bool free_params_immediately, - 
std::string lora_model_dir, - rng_type_t rng_type, - bool vae_tiling, - sd_type_t wtype, - schedule_t schedule) { + bool vae_decode_only, + bool free_params_immediately, + std::string lora_model_dir, + rng_type_t rng_type, + bool vae_tiling, + sd_type_t wtype, + schedule_t schedule) { this->n_threads = n_threads; bool standalone = clip_path != vae_path && vae_path != unet_path; @@ -184,7 +184,7 @@ class StableDiffusionGGML { } this->free_params_immediately = free_params_immediately; - this->lora_model_dir = std::move(lora_model_dir); + this->lora_model_dir = std::move(lora_model_dir); if (rng_type == STD_DEFAULT_RNG) { rng = std::make_shared(); } else if (rng_type == CUDA_RNG) { @@ -192,14 +192,13 @@ class StableDiffusionGGML { } this->vae_tiling = vae_tiling; - if (this->wtype != (ggml_type) wtype) { - this->wtype = (ggml_type) wtype; + if (this->wtype != (ggml_type)wtype) { + this->wtype = (ggml_type)wtype; // TODO: can reload weight // if (!standalone) { // free_diffusions_params(); // load_diffusions_from_file(model_path); // } - } if (this->schedule != schedule) { @@ -208,7 +207,7 @@ class StableDiffusionGGML { } } - bool load_clip_from_file(const std::string &model_path, bool standalone = true, const std::string &prefix = "te.") { + bool load_clip_from_file(const std::string& model_path, bool standalone = true, const std::string& prefix = "te.") { if (backend == NULL) { LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); return false; @@ -274,11 +273,11 @@ class StableDiffusionGGML { } struct ggml_init_params params; - params.mem_size = static_cast(3 * 1024) * 1024; // 3M + params.mem_size = static_cast(3 * 1024) * 1024; // 3M params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check if (!ctx) { LOG_ERROR("ggml_init() failed"); return false; @@ -288,10 +287,10 @@ class StableDiffusionGGML { LOG_DEBUG("loading clip weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set ignore_tensors; - for (auto &pair: tensors) { + for (auto& pair : tensors) { tensors_need_to_load.insert(pair); } @@ -316,9 +315,9 @@ class StableDiffusionGGML { } } - bool load_unet_from_file(const std::string &model_path, - bool standalone = true, - const std::string &prefix = "unet.") { + bool load_unet_from_file(const std::string& model_path, + bool standalone = true, + const std::string& prefix = "unet.") { if (backend == NULL) { LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); return false; @@ -351,11 +350,11 @@ class StableDiffusionGGML { } struct ggml_init_params params; - params.mem_size = static_cast(3 * 1024) * 1024; // 3M + params.mem_size = static_cast(3 * 1024) * 1024; // 3M params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; - struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check if (!ctx) { LOG_ERROR("ggml_init() failed"); @@ -366,13 +365,13 @@ class StableDiffusionGGML { LOG_DEBUG("loading weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set 
ignore_tensors; - ggml_tensor *alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); - calculate_alphas_cumprod((float *) alphas_cumprod_tensor->data); + ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); + calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); tensors_need_to_load["alphas_cumprod"] = alphas_cumprod_tensor; - for (auto &pair: tensors) { - const std::string &name = pair.first; + for (auto& pair : tensors) { + const std::string& name = pair.first; if (starts_with(name, "cond_stage_model.") || starts_with(name, "first_stage_model.")) { ignore_tensors.insert(name); continue; @@ -415,9 +414,9 @@ class StableDiffusionGGML { } } - bool load_vae_from_file(const std::string &model_path, - bool standalone = true, - const std::string &prefix = "vae.") { + bool load_vae_from_file(const std::string& model_path, + bool standalone = true, + const std::string& prefix = "vae.") { if (backend == NULL) { LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); return false; @@ -454,11 +453,11 @@ class StableDiffusionGGML { } struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 3M + params.mem_size = static_cast(10 * 1024) * 1024; // 3M params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check if (!ctx) { LOG_ERROR("ggml_init() failed"); return false; @@ -468,10 +467,10 @@ class StableDiffusionGGML { LOG_DEBUG("loading weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set ignore_tensors; - for (auto &pair: tensors) { - const std::string &name = pair.first; + for (auto& pair : tensors) { + const std::string& name = pair.first; if (vae_decode_only && (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) { ignore_tensors.insert(name); @@ -500,7 +499,7 @@ class StableDiffusionGGML { } // load the all model from one file - bool load_diffusions_from_file(const std::string &model_path) { + bool load_diffusions_from_file(const std::string& model_path) { LOG_INFO("loading model from '%s'", model_path.c_str()); if (!load_clip_from_file(model_path, false, "")) { free_clip_params(); @@ -534,7 +533,7 @@ class StableDiffusionGGML { LOG_INFO("free vae params"); } - bool load_taesd_from_file(const std::string &taesd_path) { + bool load_taesd_from_file(const std::string& taesd_path) { if (first_stage_model.params_buffer_size > 0) { free_vae_params(); } @@ -542,7 +541,7 @@ class StableDiffusionGGML { return false; } - this->taesd_path = taesd_path; + this->taesd_path = taesd_path; use_tiny_autoencoder = true; return true; } @@ -826,25 +825,25 @@ class StableDiffusionGGML { switch (schedule) { case DISCRETE: LOG_INFO("running with discrete schedule"); - denoiser->schedule = std::make_shared(); - break; + denoiser->schedule = std::make_shared(); + break; case KARRAS: LOG_INFO("running with Karras schedule"); - denoiser->schedule = std::make_shared(); - break; + denoiser->schedule = std::make_shared(); + break; case DEFAULT: // Don't touch anything. 
- break; + break; default: LOG_ERROR("Unknown schedule %i", schedule); - abort(); + abort(); } } for (int i = 0; i < TIMESTEPS; i++) { denoiser->schedule->alphas_cumprod[i] = alphas_cumprod_tensor[i]; - denoiser->schedule->sigmas[i] = std::sqrt( - (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); + denoiser->schedule->sigmas[i] = std::sqrt( + (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); } } @@ -1614,14 +1613,14 @@ struct sd_ctx_t { sd_ctx_t* new_sd_ctx(int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir_c_str, + const char* lora_model_dir_c_str, enum rng_type_t rng_type, bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, bool keep_control_net_cpu, bool init_backend_immediately) { - sd_ctx_t *sd_ctx = (sd_ctx_t *) malloc(sizeof(sd_ctx_t)); + sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; } @@ -1647,8 +1646,7 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) { free(sd_ctx); } - -void init_backend(sd_ctx_t *sd_ctx) { +void init_backend(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1656,11 +1654,11 @@ void init_backend(sd_ctx_t *sd_ctx) { sd_ctx->sd->init_backend(); } -void set_options(sd_ctx_t *sd_ctx, +void set_options(sd_ctx_t* sd_ctx, int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir, + const char* lora_model_dir, rng_type_t rng_type, bool vae_tiling, sd_type_t wtype, @@ -1670,17 +1668,17 @@ void set_options(sd_ctx_t *sd_ctx, return; } sd_ctx->sd->set_options( - n_threads, - vae_decode_only, - free_params_immediately, - std::string(lora_model_dir), - rng_type, - vae_tiling, - wtype, - schedule); + n_threads, + vae_decode_only, + free_params_immediately, + std::string(lora_model_dir), + rng_type, + vae_tiling, + wtype, + schedule); } -bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1688,7 +1686,7 @@ bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *p return sd_ctx->sd->load_clip_from_file(std::string(model_path), true, std::string(prefix)); } -void free_clip_params(sd_ctx_t *sd_ctx) { +void free_clip_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1696,7 +1694,7 @@ void free_clip_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_clip_params(); } -bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1704,7 +1702,7 @@ bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *p return sd_ctx->sd->load_unet_from_file(std::string(model_path), true, std::string(prefix)); } -void free_unet_params(sd_ctx_t *sd_ctx) { +void free_unet_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1712,7 +1710,7 @@ void free_unet_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_unet_params(); } -bool load_vae_from_file(sd_ctx_t 
*sd_ctx, const char *model_path, const char *prefix) { +bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1720,7 +1718,7 @@ bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *pr return sd_ctx->sd->load_vae_from_file(std::string(model_path), true, std::string(prefix)); } -void free_vae_params(sd_ctx_t *sd_ctx) { +void free_vae_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1728,7 +1726,7 @@ void free_vae_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_vae_params(); } -bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { +bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1736,7 +1734,7 @@ bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { return sd_ctx->sd->load_taesd_from_file(std::string(model_path)); } -void free_taesd_params(sd_ctx_t *sd_ctx) { +void free_taesd_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1745,7 +1743,7 @@ void free_taesd_params(sd_ctx_t *sd_ctx) { } // load all model from one file -bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { +bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1754,7 +1752,7 @@ bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { } // free all model from one file -void free_diffusions_params(sd_ctx_t *sd_ctx) { +void free_diffusions_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; diff --git a/stable-diffusion.h b/stable-diffusion.h index a9f142af..642f3a3d 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -65,12 +65,12 @@ enum sd_type_t { SD_TYPE_Q8_0 = 8, SD_TYPE_Q8_1 = 9, // k-quantizations - SD_TYPE_Q2_K = 10, - SD_TYPE_Q3_K = 11, - SD_TYPE_Q4_K = 12, - SD_TYPE_Q5_K = 13, - SD_TYPE_Q6_K = 14, - SD_TYPE_Q8_K = 15, + SD_TYPE_Q2_K = 10, + SD_TYPE_Q3_K = 11, + SD_TYPE_Q4_K = 12, + SD_TYPE_Q5_K = 13, + SD_TYPE_Q6_K = 14, + SD_TYPE_Q8_K = 15, SD_TYPE_IQ2_XXS = 16, SD_TYPE_I8, SD_TYPE_I16, From 1e7ea7bf7a3537139ef8a71d805d472d2c46d3c4 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Sun, 4 Feb 2024 17:05:55 +0800 Subject: [PATCH 6/8] fix cli --- examples/cli/main.cpp | 28 ++++++++++++++++++++++++---- stable-diffusion.cpp | 3 +++ stable-diffusion.h | 15 ++++++--------- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 0ab66e70..6045a4a7 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -745,12 +745,13 @@ class CliInstance { sd_ctx = new_sd_ctx( params.n_threads, params.vae_decode_only, - true, + false, params.lora_model_dir.c_str(), params.rng_type, params.vae_tiling, params.wtype, params.schedule, + params.control_net_cpu, true); } @@ -786,6 +787,23 @@ class CliInstance { params.vae_tiling, params.wtype, params.schedule); + int c = 0; + uint8_t* input_image_buffer = stbi_load(params.control_image_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); + if (input_image_buffer == NULL) { + fprintf(stderr, "load image from '%s' failed\n", 
params.control_image_path.c_str()); + return; + } + if (c != 3) { + fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c); + free(input_image_buffer); + return; + } + + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, + 3, + input_image_buffer}; + sd_image_t* results = txt2img(sd_ctx, params.prompt.c_str(), params.negative_prompt.c_str(), @@ -796,8 +814,11 @@ class CliInstance { params.sample_method, params.sample_steps, params.seed, - params.batch_count); - results = upscaler(params, results); + params.batch_count, + &input_image, + params.control_strength); + + results = upscaler(params, results); save_image(params, results); } @@ -882,7 +903,6 @@ class CliInstance { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.n_threads, params.wtype); - if (upscaler_ctx == NULL) { printf("new_upscaler_ctx failed\n"); } else { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e29839b6..bbfeb8c8 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -552,6 +552,9 @@ class StableDiffusionGGML { } } + bool load_control_net_from_file(const std::string& control_path) { + } + bool load_from_file(const std::string& model_path, const std::string& vae_path, const std::string& control_net_path, diff --git a/stable-diffusion.h b/stable-diffusion.h index 642f3a3d..bc3637da 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -102,19 +102,16 @@ typedef struct { typedef struct sd_ctx_t sd_ctx_t; -SD_API sd_ctx_t* new_sd_ctx(const char* model_path, - const char* vae_path, - const char* taesd_path, - const char* control_net_path_c_str, - const char* lora_model_dir, - const char* embed_dir_c_str, +SD_API sd_ctx_t* new_sd_ctx(int n_threads, bool vae_decode_only, - bool vae_tiling, bool free_params_immediately, - int n_threads, + const char* lora_model_dir_c_str, + enum rng_type_t rng_type, + bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, - bool init_backend_immediately = true); + bool keep_control_net_cpu, + bool init_backend_immediately); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); From bdb8250d972a4b874ba0c1ae106141c0c628ae77 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Sun, 4 Feb 2024 17:17:12 +0800 Subject: [PATCH 7/8] fix build fail on darwin and linux --- examples/cli/main.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 6045a4a7..09abee85 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -98,18 +98,6 @@ struct SDParams { bool canny_preprocess = false; }; -static std::string sd_basename(const std::string& path) { - size_t pos = path.find_last_of('/'); - if (pos != std::string::npos) { - return path.substr(pos + 1); - } - pos = path.find_last_of('\\'); - if (pos != std::string::npos) { - return path.substr(pos + 1); - } - return path; -} - void print_params(SDParams params) { printf("Option: \n"); printf(" n_threads: %d\n", params.n_threads); From f1f24450819769468250d742e72a2eec076e50e3 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Mon, 5 Feb 2024 16:40:34 +0800 Subject: [PATCH 8/8] format code --- examples/cli/main.cpp | 80 +++++++++++++++++++++++++++++++++++++++++++ model.cpp | 65 ++++++++++++----------------------- stable-diffusion.cpp | 22 ++++++++++-- 3 files changed, 121 insertions(+), 46 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 09abee85..50a234b7 100644 --- a/examples/cli/main.cpp +++ 
b/examples/cli/main.cpp
@@ -651,6 +651,18 @@ SDParams merge_params(SDParams dst, SDParams src) {
         }
     }
 
+    if (dst.controlnet_path != src.controlnet_path) {
+        if (!src.controlnet_path.empty()) {
+            dst.controlnet_path = src.controlnet_path;
+        }
+    }
+
+    if (dst.embeddings_path != src.embeddings_path) {
+        if (!src.embeddings_path.empty()) {
+            dst.embeddings_path = src.embeddings_path;
+        }
+    }
+
     if (dst.wtype != src.wtype) {
         dst.wtype = src.wtype;
     }
@@ -667,6 +679,18 @@ SDParams merge_params(SDParams dst, SDParams src) {
         }
     }
 
+    if (dst.input_path != src.input_path) {
+        if (!src.input_path.empty()) {
+            dst.input_path = src.input_path;
+        }
+    }
+
+    if (dst.control_image_path != src.control_image_path) {
+        if (!src.control_image_path.empty()) {
+            dst.control_image_path = src.control_image_path;
+        }
+    }
+
     if (dst.prompt != src.prompt) {
         if (!src.prompt.empty()) {
             dst.prompt = src.prompt;
@@ -701,6 +725,24 @@ SDParams merge_params(SDParams dst, SDParams src) {
         }
     }
 
+    if (dst.batch_count != src.batch_count) {
+        if (src.batch_count > 0) {
+            dst.batch_count = src.batch_count;
+        }
+    }
+
+    if (dst.sample_method != src.sample_method) {
+        if (src.sample_method < N_SAMPLE_METHODS) {
+            dst.sample_method = src.sample_method;
+        }
+    }
+
+    if (dst.schedule != src.schedule) {
+        if (src.schedule < N_SCHEDULES) {
+            dst.schedule = src.schedule;
+        }
+    }
+
     if (dst.sample_steps != src.sample_steps) {
         if (src.sample_steps > 0) {
             dst.sample_steps = src.sample_steps;
@@ -713,11 +755,43 @@ SDParams merge_params(SDParams dst, SDParams src) {
         }
     }
 
+    if (dst.control_strength != src.control_strength) {
+        if (src.control_strength >= 0.f && src.control_strength <= 1.f) {
+            dst.control_strength = src.control_strength;
+        }
+    }
+
+    if (dst.rng_type != src.rng_type) {
+        if (src.rng_type <= CUDA_RNG) {
+            dst.rng_type = src.rng_type;
+        }
+    }
+
     if (dst.seed != src.seed) {
         if (src.seed > 0) {
             dst.seed = src.seed;
         }
     }
+
+    if (dst.verbose != src.verbose) {
+        dst.verbose = src.verbose;
+    }
+
+    if (dst.vae_tiling != src.vae_tiling) {
+        dst.vae_tiling = src.vae_tiling;
+    }
+
+    if (dst.vae_decode_only != src.vae_decode_only) {
+        dst.vae_decode_only = src.vae_decode_only;
+    }
+
+    if (dst.control_net_cpu != src.control_net_cpu) {
+        dst.control_net_cpu = src.control_net_cpu;
+    }
+
+    if (dst.canny_preprocess != src.canny_preprocess) {
+        dst.canny_preprocess = src.canny_preprocess;
+    }
     return dst;
 }
diff --git a/model.cpp b/model.cpp
index 847f612c..59e788b1 100644
--- a/model.cpp
+++ b/model.cpp
@@ -275,48 +275,40 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq)
     }
 
     if (match(m, std::regex(format("unet%ctime_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) {
-        return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) +
-               m[1];
+        return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
     }
 
     if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
         std::string suffix = get_converted_suffix(m[1], m[3]);
         // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str());
-        return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) +
-               std::to_string(1 + std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq +
+        return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(1 +
std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + (m[1] == "attentions" ? "1" : "0") + seq + suffix; } if (match(m, std::regex(format("unet%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[0], m[2]); - return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + - (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) + + return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) + seq + suffix; } if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[1], m[3]); - return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + - std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + + return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + (m[1] == "attentions" ? "1" : "0") + seq + suffix; } - if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), - key)) { - return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + - std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op"; + if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) { + return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op"; } if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) { - return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + - std::to_string(2 + std::stoi(m[0]) * 3) + seq + + return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(2 + std::stoi(m[0]) * 3) + seq + (std::stoi(m[0]) > 0 ? 
"2" : "1") + seq + "conv"; } // clip if (match(m, std::regex(format("te%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { - return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + - seq + m[1]; + return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + seq + m[1]; } if (match(m, std::regex(format("te%ctext_model(.*)", seq)), key)) { @@ -328,9 +320,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str()); } - if (match(m, - std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix; std::string block_name; if (m[1] == "attentions") { @@ -344,9 +334,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, block_name.c_str(), std::stoi(m[2]) + 1, seq, suffix.c_str()); } - if (match(m, - std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) { std::string suffix = m[3]; if (suffix == "conv_shortcut") { suffix = "nin_shortcut"; @@ -355,16 +343,12 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str()); } - if (match(m, - std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) { return format("first_stage_model%c%s%cdown%c%d%cdownsample%cconv", seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq); } - if (match(m, - std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) { std::string suffix = m[3]; if (suffix == "conv_shortcut") { suffix = "nin_shortcut"; @@ -373,8 +357,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) { return format("first_stage_model%c%s%cup%c%d%cupsample%cconv", seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq); } @@ -511,9 +494,8 @@ void convert_tensor(void* src, ggml_type src_type, void* dst, ggml_type dst_type } else { auto qtype = ggml_internal_get_type_traits(src_type); if (qtype.to_float == NULL) { - throw std::runtime_error( - format("type %s unsupported for integer quantization: no dequantization available", - ggml_type_name(src_type))); + throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", + ggml_type_name(src_type))); } qtype.to_float(src, 
(float*)dst, n);
        }
    }
@@ -807,8 +789,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const std::string& prefix)
             ne[i] = shape[i].get<int64_t>();
         }
 
-        TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index,
-                                     ST_HEADER_SIZE_LEN + header_size_ + begin);
+        TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin);
 
         tensor_storage.reverse_ne();
 
@@ -1067,7 +1048,7 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer,
                 break;
             case ']':  // EMPTY_LIST       = b']'   # push empty list
                 break;
-            // skip unused sections
+                // skip unused sections
             case 'h':  // BINGET         = b'h'   #   "    "    "    "   "   "  ;   "    " 1-byte arg
             case 'q':  // BINPUT         = b'q'   #   "    "    "    "   "   ;   "    " 1-byte arg
             case 'Q':  // BINPERSID      = b'Q'   #   "    "    "  ;   "    "    "   " stack
@@ -1360,8 +1341,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend)
 
                     if (tensor_storage.is_bf16) {
                         // inplace op
-                        bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data,
-                                        tensor_storage.nelements());
+                        bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
                     }
                 } else {
                     read_buffer.resize(tensor_storage.nbytes());
@@ -1369,8 +1349,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend)
 
                     if (tensor_storage.is_bf16) {
                         // inplace op
-                        bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(),
-                                        tensor_storage.nelements());
+                        bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
                     }
 
                     convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
@@ -1382,8 +1361,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend)
 
                 if (tensor_storage.is_bf16) {
                     // inplace op
-                    bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(),
-                                    tensor_storage.nelements());
+                    bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
                 }
 
                 if (tensor_storage.type == dst_tensor->type) {
@@ -1443,8 +1421,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors, ggml_backend_t backend)
                     "tensor '%s' has wrong shape in model file: "
                     "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
                     name.c_str(),
-                    (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2],
-                    (int)tensor_storage.ne[3],
+                    (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
                     (int)real->ne[0], (int)real->ne[1], (int)real->ne[2], (int)real->ne[3]);
                 return false;
             }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index bbfeb8c8..9f556dc4 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -552,7 +552,27 @@ class StableDiffusionGGML {
         }
     }
 
-    bool load_control_net_from_file(const std::string& control_path) {
+    bool load_control_net_from_file(const std::string& control_net_path, const std::string& embeddings_path, bool control_net_cpu) {
+        // NOTE: embeddings_path is accepted for the API surface but not consumed here yet.
+        if (!control_net_path.empty()) {
+            ggml_backend_t cn_backend = NULL;
+            if (control_net_cpu && !ggml_backend_is_cpu(backend)) {
+                LOG_DEBUG("ControlNet: Using CPU backend");
+                cn_backend = ggml_backend_cpu_init();
+            } else {
+                cn_backend = backend;
+            }
+            if (!control_net.load_from_file(control_net_path, cn_backend, GGML_TYPE_F16 /* just f16 controlnet models */)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    void free_control_net_params() {
+        if (control_net.params_buffer_size > 0) {
+            control_net.free_params_buffer();
+        }
     }
 
     bool load_from_file(const std::string& model_path,
                         const std::string& vae_path,
                         const
std::string& control_net_path,
                        const std::string& embeddings_path,
                        const std::string& taesd_path,
-                       bool vae_tiling_,
+                       bool vae_tiling,
                        ggml_type wtype,
                        schedule_t schedule,
                        bool control_net_cpu) {
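
---

Usage note: taken together, patches 1/8 through 8/8 expose a split-loading workflow for custom frontends. Below is a minimal sketch of the intended call sequence, not a definitive example: the file paths and thread count are placeholders, the prefix strings mirror the defaults of the C++ methods in this series ("te.", "unet.", "vae."), and error handling is abbreviated.

    // example.cpp - illustrative only; compile and link against this library.
    #include <cstdio>

    #include "stable-diffusion.h"

    int main() {
        // Create a context with no weights loaded; the last argument asks for
        // the ggml backend to be initialized immediately.
        sd_ctx_t* ctx = new_sd_ctx(/*n_threads*/ 4,
                                   /*vae_decode_only*/ true,
                                   /*free_params_immediately*/ false,
                                   /*lora_model_dir*/ "",
                                   CUDA_RNG,
                                   /*vae_tiling*/ false,
                                   SD_TYPE_F16,
                                   KARRAS,
                                   /*keep_control_net_cpu*/ false,
                                   /*init_backend_immediately*/ true);
        if (ctx == NULL) {
            fprintf(stderr, "new_sd_ctx failed\n");
            return 1;
        }

        // Either load everything from a single checkpoint ...
        // load_diffusions_from_file(ctx, "sd-v1-4.ckpt");

        // ... or load the components one by one from standalone files.
        if (!load_clip_from_file(ctx, "clip.safetensors", "te.") ||
            !load_unet_from_file(ctx, "unet.safetensors", "unet.") ||
            !load_vae_from_file(ctx, "vae.safetensors", "vae.")) {
            fprintf(stderr, "loading weights failed\n");
            free_sd_ctx(ctx);
            return 1;
        }

        // ... run txt2img()/img2img() here, as in examples/cli/main.cpp ...

        // Components can be released individually, then the context itself.
        free_clip_params(ctx);
        free_unet_params(ctx);
        free_vae_params(ctx);
        free_sd_ctx(ctx);
        return 0;
    }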