From 2b7294d5bb8a575a4b73d037f021ae778dd8c337 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Tue, 23 Jan 2024 18:02:35 +0800 Subject: [PATCH 1/8] feat: export more api for custom --- .clang-format | 1 - examples/cli/main.cpp | 165 +++--- model.cpp | 544 ++++++++++--------- model.h | 2 +- stable-diffusion.cpp | 1180 ++++++++++++++++++++++++++--------------- stable-diffusion.h | 82 ++- 6 files changed, 1209 insertions(+), 765 deletions(-) diff --git a/.clang-format b/.clang-format index 4fe720b8..37881bfc 100644 --- a/.clang-format +++ b/.clang-format @@ -3,7 +3,6 @@ UseTab: Never IndentWidth: 4 TabWidth: 4 AllowShortIfStatementsOnASingleLine: false -IndentCaseLabels: false ColumnLimit: 0 AccessModifierOffset: -4 NamespaceIndentation: All diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index af2c337d..bde19f34 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -9,40 +9,42 @@ #include "stable-diffusion.h" #define STB_IMAGE_IMPLEMENTATION + #include "stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION #define STB_IMAGE_WRITE_STATIC + #include "stb_image_write.h" -const char* rng_type_to_str[] = { - "std_default", - "cuda", +const char *rng_type_to_str[] = { + "std_default", + "cuda", }; // Names of the sampler method, same order as enum sample_method in stable-diffusion.h -const char* sample_method_str[] = { - "euler_a", - "euler", - "heun", - "dpm2", - "dpm++2s_a", - "dpm++2m", - "dpm++2mv2", - "lcm", +const char *sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h -const char* schedule_str[] = { - "default", - "discrete", - "karras", +const char *schedule_str[] = { + "default", + "discrete", + "karras", }; -const char* modes_str[] = { - "txt2img", - "img2img", - "convert", +const char *modes_str[] = { + "txt2img", + "img2img", + "convert", }; enum SDMode { @@ -54,7 +56,7 @@ enum SDMode { struct SDParams { int n_threads = -1; - SDMode mode = TXT2IMG; + SDMode mode = TXT2IMG; std::string model_path; std::string vae_path; @@ -68,22 +70,22 @@ struct SDParams { std::string prompt; std::string negative_prompt; float cfg_scale = 7.0f; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; int batch_count = 1; sample_method_t sample_method = EULER_A; - schedule_t schedule = DEFAULT; - int sample_steps = 20; - float strength = 0.75f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool vae_tiling = false; + schedule_t schedule = DEFAULT; + int sample_steps = 20; + float strength = 0.75f; + rng_type_t rng_type = CUDA_RNG; + int64_t seed = 42; + bool verbose = false; + bool vae_tiling = false; }; -static std::string sd_basename(const std::string& path) { +static std::string sd_basename(const std::string &path) { size_t pos = path.find_last_of('/'); if (pos != std::string::npos) { return path.substr(pos + 1); @@ -122,7 +124,7 @@ void print_params(SDParams params) { printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false"); } -void print_usage(int argc, const char* argv[]) { +void print_usage(int argc, const char *argv[]) { printf("usage: %s [arguments]\n", argv[0]); printf("\n"); printf("arguments:\n"); @@ -159,7 +161,7 @@ void print_usage(int argc, const char* argv[]) { printf(" -v, --verbose print extra info\n"); } -void parse_args(int argc, const char** argv, SDParams& params) { +void parse_args(int argc, const char **argv, SDParams ¶ms) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -176,8 +178,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* mode_selected = argv[i]; - int mode_found = -1; + const char *mode_selected = argv[i]; + int mode_found = -1; for (int d = 0; d < MODE_COUNT; d++) { if (!strcmp(mode_selected, modes_str[d])) { mode_found = d; @@ -188,7 +190,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { mode_selected); exit(1); } - params.mode = (SDMode)mode_found; + params.mode = (SDMode) mode_found; } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_arg = true; @@ -234,7 +236,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { } else if (type == "q8_0") { params.wtype = SD_TYPE_Q8_0; } else { - fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n", + fprintf(stderr, + "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n", type.c_str()); exit(1); } @@ -331,8 +334,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* schedule_selected = argv[i]; - int schedule_found = -1; + const char *schedule_selected = argv[i]; + int schedule_found = -1; for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; @@ -342,7 +345,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.schedule = (schedule_t)schedule_found; + params.schedule = (schedule_t) schedule_found; } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_arg = true; @@ -354,8 +357,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* sample_method_selected = argv[i]; - int sample_method_found = -1; + const char *sample_method_selected = argv[i]; + int sample_method_found = -1; for (int m = 0; m < N_SAMPLE_METHODS; m++) { if (!strcmp(sample_method_selected, sample_method_str[m])) { sample_method_found = m; @@ -365,7 +368,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.sample_method = (sample_method_t)sample_method_found; + params.sample_method = (sample_method_t) sample_method_found; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv); exit(0); @@ -431,7 +434,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } if (params.seed < 0) { - srand((int)time(NULL)); + srand((int) time(NULL)); params.seed = rand(); } @@ -462,8 +465,8 @@ std::string get_image_params(SDParams params, int64_t seed) { return parameter_string; } -void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { - SDParams* params = (SDParams*)data; +void sd_log_cb(enum sd_log_level_t level, const char *log, void *data) { + SDParams *params = (SDParams *) data; if (!params->verbose && level <= SD_LOG_DEBUG) { return; } @@ -476,11 +479,11 @@ void sd_log_cb(enum 
sd_log_level_t level, const char* log, void* data) { } } -int main(int argc, const char* argv[]) { +int main(int argc, const char *argv[]) { SDParams params; parse_args(argc, argv, params); - sd_set_log_callback(sd_log_cb, (void*)¶ms); + sd_set_log_callback(sd_log_cb, (void *) ¶ms); if (params.verbose) { print_params(params); @@ -488,7 +491,10 @@ int main(int argc, const char* argv[]) { } if (params.mode == CONVERT) { - bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype); + bool success = convert(params.model_path.c_str(), + params.vae_path.c_str(), + params.output_path.c_str(), + params.wtype); if (!success) { fprintf(stderr, "convert '%s'/'%s' to '%s' failed\n", @@ -505,12 +511,12 @@ int main(int argc, const char* argv[]) { } } - bool vae_decode_only = true; - uint8_t* input_image_buffer = NULL; + bool vae_decode_only = true; + uint8_t *input_image_buffer = NULL; if (params.mode == IMG2IMG) { vae_decode_only = false; - int c = 0; + int c = 0; input_image_buffer = stbi_load(params.input_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); if (input_image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); @@ -533,24 +539,45 @@ int main(int argc, const char* argv[]) { } } - sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(), - params.vae_path.c_str(), - params.taesd_path.c_str(), - params.lora_model_dir.c_str(), - vae_decode_only, - params.vae_tiling, - true, - params.n_threads, - params.wtype, - params.rng_type, - params.schedule); + sd_ctx_t *sd_ctx = new_sd_ctx( + params.n_threads, + vae_decode_only, + true, + params.lora_model_dir.c_str(), + params.rng_type, + params.vae_tiling, + params.wtype, + params.schedule, + true + ); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); return 1; } - sd_image_t* results; + if (!load_diffusions_from_file(sd_ctx, params.model_path.c_str())) { + printf("load diffusions model failed\n"); + return 1; + } + + if (!params.taesd_path.empty()) { + free_unet_params(sd_ctx); + if (!load_taesd_from_file(sd_ctx, params.taesd_path.c_str())) { + printf("load taesd model failed\n"); + return 1; + } + } + + if (!params.vae_path.empty()) { + free_vae_params(sd_ctx); + if (!load_vae_from_file(sd_ctx, params.vae_path.c_str())) { + printf("load vae model failed\n"); + return 1; + } + } + + sd_image_t *results; if (params.mode == TXT2IMG) { results = txt2img(sd_ctx, params.prompt.c_str(), @@ -564,8 +591,8 @@ int main(int argc, const char* argv[]) { params.seed, params.batch_count); } else { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, + sd_image_t input_image = {(uint32_t) params.width, + (uint32_t) params.height, 3, input_image_buffer}; @@ -592,7 +619,7 @@ int main(int argc, const char* argv[]) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0) { - upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + upscaler_ctx_t *upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.n_threads, params.wtype); @@ -614,7 +641,7 @@ int main(int argc, const char* argv[]) { } } - size_t last = params.output_path.find_last_of("."); + size_t last = params.output_path.find_last_of("."); std::string dummy_name = last != std::string::npos ? 
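// The hunk above replaces the old all-in-one new_sd_ctx(model, vae, taesd, ...)
// call with the split loading API this patch exports. A minimal usage sketch
// follows; the meaning of each new_sd_ctx argument is inferred from the
// StableDiffusionGGML constructor later in this patch (n_threads,
// vae_decode_only, free_params_immediately, lora_model_dir, rng_type,
// vae_tiling, wtype, schedule, init_backend_immediately), and the file names
// are hypothetical:
//
//     #include "stable-diffusion.h"
//
//     sd_ctx_t *ctx = new_sd_ctx(8, true, true, "", CUDA_RNG,
//                                false, SD_TYPE_F16, DEFAULT, true);
//     if (!load_diffusions_from_file(ctx, "sd-v1-5.safetensors")) {
//         return 1;  // every loader returns false on failure, as checked above
//     }
//     free_vae_params(ctx);                               // drop the bundled VAE
//     load_vae_from_file(ctx, "custom_vae.safetensors");  // swap in another one
//     sd_image_t *img = txt2img(ctx, "a photo of a cat", "", -1, 7.0f,
//                               512, 512, EULER_A, 20, 42, 1);
//
// One detail worth double-checking in the hunk above: the TAESD branch calls
// free_unet_params() before load_taesd_from_file(), but TAESD stands in for
// the VAE decoder rather than the UNet, so free_vae_params() may be what was
// intended there.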
params.output_path.substr(0, last) : params.output_path; for (int i = 0; i < params.batch_count; i++) { if (results[i].data == NULL) { diff --git a/model.cpp b/model.cpp index 387a9cf5..60f4dda7 100644 --- a/model.cpp +++ b/model.cpp @@ -23,7 +23,7 @@ #define ST_HEADER_SIZE_LEN 8 -uint64_t read_u64(uint8_t* buffer) { +uint64_t read_u64(uint8_t *buffer) { // little endian uint64_t value = 0; value |= static_cast(buffer[7]) << 56; @@ -37,7 +37,7 @@ uint64_t read_u64(uint8_t* buffer) { return value; } -int32_t read_int(uint8_t* buffer) { +int32_t read_int(uint8_t *buffer) { // little endian int value = 0; value |= buffer[3] << 24; @@ -47,7 +47,7 @@ int32_t read_int(uint8_t* buffer) { return value; } -uint16_t read_short(uint8_t* buffer) { +uint16_t read_short(uint8_t *buffer) { // little endian uint16_t value = 0; value |= buffer[1] << 8; @@ -58,44 +58,44 @@ uint16_t read_short(uint8_t* buffer) { /*================================================= Preprocess ==================================================*/ std::string self_attn_names[] = { - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", }; -const char* unused_tensors[] = { - "betas", - "alphas_cumprod_prev", - "sqrt_alphas_cumprod", - "sqrt_one_minus_alphas_cumprod", - "log_one_minus_alphas_cumprod", - "sqrt_recip_alphas_cumprod", - "sqrt_recipm1_alphas_cumprod", - "posterior_variance", - "posterior_log_variance_clipped", - "posterior_mean_coef1", - "posterior_mean_coef2", - "cond_stage_model.transformer.text_model.embeddings.position_ids", - "cond_stage_model.model.logit_scale", - "cond_stage_model.model.text_projection", - "conditioner.embedders.0.transformer.text_model.embeddings.position_ids", - "conditioner.embedders.0.model.logit_scale", - "conditioner.embedders.1.model.logit_scale", - "model.diffusion_model.time_embedding.cond_proj.weight", - "unet.time_embedding.cond_proj.weight", - "model_ema.decay", - "model_ema.num_updates", - "model_ema.diffusion_model", - "control_model", - "embedding_manager", - "denoiser.sigmas", +const char *unused_tensors[] = { + "betas", + "alphas_cumprod_prev", + "sqrt_alphas_cumprod", + "sqrt_one_minus_alphas_cumprod", + "log_one_minus_alphas_cumprod", + "sqrt_recip_alphas_cumprod", + "sqrt_recipm1_alphas_cumprod", + "posterior_variance", + "posterior_log_variance_clipped", + "posterior_mean_coef1", + "posterior_mean_coef2", + "cond_stage_model.transformer.text_model.embeddings.position_ids", + "cond_stage_model.model.logit_scale", + "cond_stage_model.model.text_projection", + "conditioner.embedders.0.transformer.text_model.embeddings.position_ids", + "conditioner.embedders.0.model.logit_scale", + "conditioner.embedders.1.model.logit_scale", + "model.diffusion_model.time_embedding.cond_proj.weight", + "unet.time_embedding.cond_proj.weight", + "model_ema.decay", + "model_ema.num_updates", + "model_ema.diffusion_model", + "control_model", + "embedding_manager", + "denoiser.sigmas", }; bool is_unused_tensor(std::string name) { - for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char*); i++) { + for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char *); i++) { if (starts_with(name, unused_tensors[i])) { return true; } @@ -104,54 +104,54 @@ bool is_unused_tensor(std::string name) { } std::unordered_map 
open_clip_to_hf_clip_model = { - {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"}, - {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"}, - {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"}, - {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"}, - {"model.text_projection", "transformer.text_model.text_projection"}, + {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"}, + {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"}, + {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"}, + {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"}, + {"model.text_projection", "transformer.text_model.text_projection"}, }; std::unordered_map open_clip_to_hk_clip_resblock = { - {"attn.out_proj.bias", "self_attn.out_proj.bias"}, - {"attn.out_proj.weight", "self_attn.out_proj.weight"}, - {"ln_1.bias", "layer_norm1.bias"}, - {"ln_1.weight", "layer_norm1.weight"}, - {"ln_2.bias", "layer_norm2.bias"}, - {"ln_2.weight", "layer_norm2.weight"}, - {"mlp.c_fc.bias", "mlp.fc1.bias"}, - {"mlp.c_fc.weight", "mlp.fc1.weight"}, - {"mlp.c_proj.bias", "mlp.fc2.bias"}, - {"mlp.c_proj.weight", "mlp.fc2.weight"}, + {"attn.out_proj.bias", "self_attn.out_proj.bias"}, + {"attn.out_proj.weight", "self_attn.out_proj.weight"}, + {"ln_1.bias", "layer_norm1.bias"}, + {"ln_1.weight", "layer_norm1.weight"}, + {"ln_2.bias", "layer_norm2.bias"}, + {"ln_2.weight", "layer_norm2.weight"}, + {"mlp.c_fc.bias", "mlp.fc1.bias"}, + {"mlp.c_fc.weight", "mlp.fc1.weight"}, + {"mlp.c_proj.bias", "mlp.fc2.bias"}, + {"mlp.c_proj.weight", "mlp.fc2.weight"}, }; std::unordered_map vae_decoder_name_map = { - {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"}, }; -std::string 
convert_open_clip_to_hf_clip(const std::string& name) { +std::string convert_open_clip_to_hf_clip(const std::string &name) { std::string new_name = name; std::string prefix; if (starts_with(new_name, "conditioner.embedders.0.")) { - prefix = "cond_stage_model."; + prefix = "cond_stage_model."; new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { - prefix = "cond_stage_model.1."; + prefix = "cond_stage_model.1."; new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "cond_stage_model.")) { - prefix = "cond_stage_model."; + prefix = "cond_stage_model."; new_name = new_name.substr(strlen("cond_stage_model.")); } else { return new_name; } std::string open_clip_resblock_prefix = "model.transformer.resblocks."; - std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; + std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) { new_name = open_clip_to_hf_clip_model[new_name]; @@ -159,21 +159,21 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { if (new_name.find(open_clip_resblock_prefix) == 0) { std::string remain = new_name.substr(open_clip_resblock_prefix.length()); - std::string idx = remain.substr(0, remain.find(".")); + std::string idx = remain.substr(0, remain.find(".")); std::string suffix = remain.substr(idx.length() + 1); if (suffix == "attn.in_proj_weight" || suffix == "attn.in_proj_bias") { new_name = hf_clip_resblock_prefix + idx + "." + suffix; } else if (open_clip_to_hk_clip_resblock.find(suffix) != open_clip_to_hk_clip_resblock.end()) { std::string new_suffix = open_clip_to_hk_clip_resblock[suffix]; - new_name = hf_clip_resblock_prefix + idx + "." + new_suffix; + new_name = hf_clip_resblock_prefix + idx + "." 
+ new_suffix; } } return prefix + new_name; } -std::string convert_vae_decoder_name(const std::string& name) { +std::string convert_vae_decoder_name(const std::string &name) { if (vae_decoder_name_map.find(name) != vae_decoder_name_map.end()) { return vae_decoder_name_map[name]; } @@ -181,57 +181,57 @@ std::string convert_vae_decoder_name(const std::string& name) { } std::unordered_map> suffix_conversion_underline = { - { - "attentions", { - {"to_k", "k"}, - {"to_q", "q"}, - {"to_v", "v"}, - {"to_out_0", "proj_out"}, - {"group_norm", "norm"}, + "attentions", + { + {"to_k", "k"}, + {"to_q", "q"}, + {"to_v", "v"}, + {"to_out_0", "proj_out"}, + {"group_norm", "norm"}, + }, }, - }, - { - "resnets", { - {"conv1", "in_layers_2"}, - {"conv2", "out_layers_3"}, - {"norm1", "in_layers_0"}, - {"norm2", "out_layers_0"}, - {"time_emb_proj", "emb_layers_1"}, - {"conv_shortcut", "skip_connection"}, + "resnets", + { + {"conv1", "in_layers_2"}, + {"conv2", "out_layers_3"}, + {"norm1", "in_layers_0"}, + {"norm2", "out_layers_0"}, + {"time_emb_proj", "emb_layers_1"}, + {"conv_shortcut", "skip_connection"}, + }, }, - }, }; std::unordered_map> suffix_conversion_dot = { - { - "attentions", { - {"to_k", "k"}, - {"to_q", "q"}, - {"to_v", "v"}, - {"to_out.0", "proj_out"}, - {"group_norm", "norm"}, + "attentions", + { + {"to_k", "k"}, + {"to_q", "q"}, + {"to_v", "v"}, + {"to_out.0", "proj_out"}, + {"group_norm", "norm"}, + }, }, - }, - { - "resnets", { - {"conv1", "in_layers.2"}, - {"conv2", "out_layers.3"}, - {"norm1", "in_layers.0"}, - {"norm2", "out_layers.0"}, - {"time_emb_proj", "emb_layers.1"}, - {"conv_shortcut", "skip_connection"}, + "resnets", + { + {"conv1", "in_layers.2"}, + {"conv2", "out_layers.3"}, + {"norm1", "in_layers.0"}, + {"norm2", "out_layers.0"}, + {"time_emb_proj", "emb_layers.1"}, + {"conv_shortcut", "skip_connection"}, + }, }, - }, }; -std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) { +std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) { std::vector m; - auto match = [](std::vector& match_list, const std::regex& regex, const std::string& key) { + auto match = [](std::vector &match_list, const std::regex ®ex, const std::string &key) { auto r = std::smatch{}; if (!std::regex_match(key, r, regex)) { return false; @@ -251,7 +251,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) suffix_conversion = suffix_conversion_dot; } - auto get_converted_suffix = [&suffix_conversion](const std::string& outer_key, const std::string& inner_key) { + auto get_converted_suffix = [&suffix_conversion](const std::string &outer_key, const std::string &inner_key) { auto outer_iter = suffix_conversion.find(outer_key); if (outer_iter != suffix_conversion.end()) { auto inner_iter = outer_iter->second.find(inner_key); @@ -276,40 +276,50 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) } if (match(m, std::regex(format("unet%ctime_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) { - return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1]; + return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + + m[1]; } - if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex( + format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { 
std::string suffix = get_converted_suffix(m[1], m[3]); // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str()); - return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(1 + std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + + return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + + std::to_string(1 + std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + (m[1] == "attentions" ? "1" : "0") + seq + suffix; } if (match(m, std::regex(format("unet%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[0], m[2]); - return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) + + return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + + (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) + seq + suffix; } - if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex( + format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[1], m[3]); - return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + + return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + (m[1] == "attentions" ? "1" : "0") + seq + suffix; } - if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) { - return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op"; + if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), + key)) { + return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + + std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op"; } if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) { - return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(2 + std::stoi(m[0]) * 3) + seq + + return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + + std::to_string(2 + std::stoi(m[0]) * 3) + seq + (std::stoi(m[0]) > 0 ? 
"2" : "1") + seq + "conv"; } // clip if (match(m, std::regex(format("te%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { - return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + seq + m[1]; + return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + + seq + m[1]; } if (match(m, std::regex(format("te%ctext_model(.*)", seq)), key)) { @@ -321,21 +331,25 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, + std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), + key)) { std::string suffix; std::string block_name; if (m[1] == "attentions") { block_name = "attn"; - suffix = get_converted_suffix(m[1], m[3]); + suffix = get_converted_suffix(m[1], m[3]); } else { block_name = "block"; - suffix = m[3]; + suffix = m[3]; } return format("first_stage_model%c%s%cmid%c%s_%d%c%s", seq, m[0].c_str(), seq, seq, block_name.c_str(), std::stoi(m[2]) + 1, seq, suffix.c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) { + if (match(m, + std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), + key)) { std::string suffix = m[3]; if (suffix == "conv_shortcut") { suffix = "nin_shortcut"; @@ -344,12 +358,16 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) { + if (match(m, + std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), + key)) { return format("first_stage_model%c%s%cdown%c%d%cdownsample%cconv", seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq); } - if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) { + if (match(m, + std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), + key)) { std::string suffix = m[3]; if (suffix == "conv_shortcut") { suffix = "nin_shortcut"; @@ -358,7 +376,8 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), + key)) { return format("first_stage_model%c%s%cup%c%d%cupsample%cconv", seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq); } @@ -370,7 +389,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) return key; } -std::string convert_tensor_name(const std::string& name) { +std::string convert_tensor_name(const std::string &name) { std::string new_name; if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.")) { new_name = 
convert_open_clip_to_hf_clip(name); @@ -380,7 +399,7 @@ std::string convert_tensor_name(const std::string& name) { size_t pos = name.find('.'); if (pos != std::string::npos) { std::string name_without_network_parts = name.substr(5, pos - 5); - std::string network_part = name.substr(pos + 1); + std::string network_part = name.substr(pos + 1); // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str()); std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '_'); if (new_key.empty()) { @@ -395,7 +414,7 @@ std::string convert_tensor_name(const std::string& name) { size_t pos = name.find_last_of('.'); if (pos != std::string::npos) { std::string name_without_network_parts = name.substr(0, pos); - std::string network_part = name.substr(pos + 1); + std::string network_part = name.substr(pos + 1); // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str()); std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.'); if (new_key.empty()) { @@ -416,7 +435,7 @@ std::string convert_tensor_name(const std::string& name) { } void preprocess_tensor(TensorStorage tensor_storage, - std::vector& processed_tensor_storages) { + std::vector &processed_tensor_storages) { std::vector result; std::string new_name = convert_tensor_name(tensor_storage.name); @@ -439,9 +458,9 @@ void preprocess_tensor(TensorStorage tensor_storage, std::string prefix = new_name.substr(0, prefix_size); std::vector chunks = tensor_storage.chunk(3); - chunks[0].name = prefix + "self_attn.q_proj.weight"; - chunks[1].name = prefix + "self_attn.k_proj.weight"; - chunks[2].name = prefix + "self_attn.v_proj.weight"; + chunks[0].name = prefix + "self_attn.q_proj.weight"; + chunks[1].name = prefix + "self_attn.k_proj.weight"; + chunks[2].name = prefix + "self_attn.v_proj.weight"; processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end()); @@ -451,9 +470,9 @@ void preprocess_tensor(TensorStorage tensor_storage, std::string prefix = new_name.substr(0, prefix_size); std::vector chunks = tensor_storage.chunk(3); - chunks[0].name = prefix + "self_attn.q_proj.bias"; - chunks[1].name = prefix + "self_attn.k_proj.bias"; - chunks[2].name = prefix + "self_attn.v_proj.bias"; + chunks[0].name = prefix + "self_attn.q_proj.bias"; + chunks[1].name = prefix + "self_attn.k_proj.bias"; + chunks[2].name = prefix + "self_attn.v_proj.bias"; processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end()); } else { @@ -463,37 +482,38 @@ void preprocess_tensor(TensorStorage tensor_storage, float bf16_to_f32(uint16_t bfloat16) { uint32_t val_bits = (static_cast(bfloat16) << 16); - return *reinterpret_cast(&val_bits); + return *reinterpret_cast(&val_bits); } -void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) { +void bf16_to_f32_vec(uint16_t *src, float *dst, int64_t n) { // support inplace op for (int64_t i = n - 1; i >= 0; i--) { dst[i] = bf16_to_f32(src[i]); } } -void convert_tensor(void* src, ggml_type src_type, void* dst, ggml_type dst_type, int n) { +void convert_tensor(void *src, ggml_type src_type, void *dst, ggml_type dst_type, int n) { if (src_type == dst_type) { size_t nbytes = n * ggml_type_size(src_type) / ggml_blck_size(src_type); - memcpy(((char*)dst), ((char*)src), nbytes); + memcpy(((char *) dst), ((char *) src), nbytes); } else if (src_type == GGML_TYPE_F32) { if (dst_type == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n); + 
ggml_fp32_to_fp16_row((float *) src, (ggml_fp16_t *) dst, n); } else { int64_t hist[16]; - ggml_quantize_chunk(dst_type, (float*)src, dst, 0, n, hist); + ggml_quantize_chunk(dst_type, (float *) src, dst, 0, n, hist); } } else if (dst_type == GGML_TYPE_F32) { if (src_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t*)src, (float*)dst, n); + ggml_fp16_to_fp32_row((ggml_fp16_t *) src, (float *) dst, n); } else { auto qtype = ggml_internal_get_type_traits(src_type); if (qtype.to_float == NULL) { - throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", - ggml_type_name(src_type))); + throw std::runtime_error( + format("type %s unsupported for integer quantization: no dequantization available", + ggml_type_name(src_type))); } - qtype.to_float(src, (float*)dst, n); + qtype.to_float(src, (float *) dst, n); } } else { // src_type == GGML_TYPE_F16 => dst_type is quantized @@ -505,13 +525,13 @@ void convert_tensor(void* src, ggml_type src_type, void* dst, ggml_type dst_type } std::vector buf; buf.resize(sizeof(float) * n); - char* src_data_f32 = buf.data(); - qtype.to_float(src, (float*)src_data_f32, n); + char *src_data_f32 = buf.data(); + qtype.to_float(src, (float *) src_data_f32, n); if (dst_type == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n); + ggml_fp32_to_fp16_row((float *) src_data_f32, (ggml_fp16_t *) dst, n); } else { int64_t hist[16]; - ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, n, hist); + ggml_quantize_chunk(dst_type, (float *) src_data_f32, dst, 0, n, hist); } } } @@ -549,7 +569,7 @@ std::map unicode_to_byte() { // byte_decoder = {v: k for k, v in byte_encoder.items()} std::map byte_decoder; - for (const auto& entry : byte_to_unicode) { + for (const auto &entry: byte_to_unicode) { byte_decoder[entry.second] = entry.first; } @@ -558,8 +578,8 @@ std::map unicode_to_byte() { return byte_decoder; } -bool is_zip_file(const std::string& file_path) { - struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); +bool is_zip_file(const std::string &file_path) { + struct zip_t *zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { return false; } @@ -567,7 +587,7 @@ bool is_zip_file(const std::string& file_path) { return true; } -bool is_gguf_file(const std::string& file_path) { +bool is_gguf_file(const std::string &file_path) { std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { return false; @@ -588,7 +608,7 @@ bool is_gguf_file(const std::string& file_path) { return true; } -bool is_safetensors_file(const std::string& file_path) { +bool is_safetensors_file(const std::string &file_path) { std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { return false; @@ -605,7 +625,7 @@ bool is_safetensors_file(const std::string& file_path) { } uint8_t header_size_buf[ST_HEADER_SIZE_LEN]; - file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN); + file.read((char *) header_size_buf, ST_HEADER_SIZE_LEN); if (!file) { return false; } @@ -630,7 +650,7 @@ bool is_safetensors_file(const std::string& file_path) { return true; } -bool ModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_file(const std::string &file_path, const std::string &prefix) { if (is_directory(file_path)) { LOG_INFO("load %s using diffusers format", file_path.c_str()); return init_from_diffusers_file(file_path, prefix); @@ -651,14 +671,14 @@ bool ModelLoader::init_from_file(const std::string& file_path, const 
std::string /*================================================= GGUFModelLoader ==================================================*/ -bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_gguf_file(const std::string &file_path, const std::string &prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - gguf_context* ctx_gguf_ = NULL; - ggml_context* ctx_meta_ = NULL; - ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); + gguf_context *ctx_gguf_ = NULL; + ggml_context *ctx_meta_ = NULL; + ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); if (!ctx_gguf_) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; @@ -666,12 +686,12 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s int n_tensors = gguf_get_n_tensors(ctx_gguf_); - size_t total_size = 0; + size_t total_size = 0; size_t data_offset = gguf_get_data_offset(ctx_gguf_); for (int i = 0; i < n_tensors; i++) { - std::string name = gguf_get_tensor_name(ctx_gguf_, i); - struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str()); - size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); + std::string name = gguf_get_tensor_name(ctx_gguf_, i); + struct ggml_tensor *dummy = ggml_get_tensor(ctx_meta_, name.c_str()); + size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); // LOG_DEBUG("%s", name.c_str()); @@ -690,7 +710,7 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s /*================================================= SafeTensorsModelLoader ==================================================*/ -ggml_type str_to_ggml_type(const std::string& dtype) { +ggml_type str_to_ggml_type(const std::string &dtype) { ggml_type ttype = GGML_TYPE_COUNT; if (dtype == "F16") { ttype = GGML_TYPE_F16; @@ -703,7 +723,7 @@ ggml_type str_to_ggml_type(const std::string& dtype) { } // https://huggingface.co/docs/safetensors/index -bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const std::string &prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; @@ -725,7 +745,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const } uint8_t header_size_buf[ST_HEADER_SIZE_LEN]; - file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN); + file.read((char *) header_size_buf, ST_HEADER_SIZE_LEN); if (!file) { LOG_ERROR("read safetensors header size failed: '%s'", file_path.c_str()); return false; @@ -749,8 +769,8 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const nlohmann::json header_ = nlohmann::json::parse(header_buf.data()); - for (auto& item : header_.items()) { - std::string name = item.key(); + for (auto &item: header_.items()) { + std::string name = item.key(); nlohmann::json tensor_info = item.value(); // LOG_DEBUG("%s %s\n", name.c_str(), tensor_info.dump().c_str()); @@ -762,11 +782,11 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const continue; } - std::string dtype = tensor_info["dtype"]; + std::string dtype = tensor_info["dtype"]; nlohmann::json shape = tensor_info["shape"]; size_t begin = tensor_info["data_offsets"][0].get(); - size_t end = 
tensor_info["data_offsets"][1].get(); + size_t end = tensor_info["data_offsets"][1].get(); ggml_type type = str_to_ggml_type(dtype); if (type == GGML_TYPE_COUNT) { @@ -779,13 +799,14 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const return false; } - int n_dims = (int)shape.size(); + int n_dims = (int) shape.size(); int64_t ne[4] = {1, 1, 1, 1}; for (int i = 0; i < n_dims; i++) { ne[i] = shape[i].get(); } - TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin); + TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, + ST_HEADER_SIZE_LEN + header_size_ + begin); tensor_storage.reverse_ne(); @@ -806,9 +827,9 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const /*================================================= DiffusersModelLoader ==================================================*/ -bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_diffusers_file(const std::string &file_path, const std::string &prefix) { std::string unet_path = path_join(file_path, "unet/diffusion_pytorch_model.safetensors"); - std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); + std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); std::string clip_path = path_join(file_path, "text_encoder/model.safetensors"); if (!init_from_safetensors_file(unet_path, "unet.")) { @@ -923,7 +944,7 @@ struct PickleTensorReader { CHECK_SIZE, READ_DIMENS }; - ReadPhase phase = READ_NAME; + ReadPhase phase = READ_NAME; size_t entry_size = 0; int32_t nelements = 0; @@ -936,14 +957,14 @@ struct PickleTensorReader { if (phase == CHECK_SIZE) { if (entry_size == value * ggml_type_size(tensor_storage.type)) { nelements = value; - phase = READ_DIMENS; + phase = READ_DIMENS; return true; } else { phase = READ_NAME; } } else if (phase == READ_DIMENS) { if (tensor_storage.n_dims + 1 > 4) { // too many dimens - phase = READ_NAME; + phase = READ_NAME; tensor_storage.n_dims = 0; } if (nelements % value == 0) { @@ -954,23 +975,23 @@ struct PickleTensorReader { return false; } - void read_global(const std::string& str) { + void read_global(const std::string &str) { if (str == "FloatStorage") { if (read_global_type) { - global_type = GGML_TYPE_F32; + global_type = GGML_TYPE_F32; read_global_type = false; } tensor_storage.type = GGML_TYPE_F32; } else if (str == "HalfStorage") { if (read_global_type) { - global_type = GGML_TYPE_F16; + global_type = GGML_TYPE_F16; read_global_type = false; } tensor_storage.type = GGML_TYPE_F16; } } - void read_string(const std::string& str, struct zip_t* zip, std::string dir) { + void read_string(const std::string &str, struct zip_t *zip, std::string dir) { if (str == "storage") { read_global_type = true; } else if (str != "state_dict") { @@ -983,8 +1004,8 @@ struct PickleTensorReader { { std::string name = zip_entry_name(zip); if (name == entry_name) { - tensor_storage.index_in_zip = (int)i; - entry_size = zip_entry_size(zip); + tensor_storage.index_in_zip = (int) i; + entry_size = zip_entry_size(zip); zip_entry_close(zip); break; } @@ -996,7 +1017,7 @@ struct PickleTensorReader { } if (!read_global_type && phase == READ_NAME) { tensor_storage.name = str; - phase = READ_DATA; + phase = READ_DATA; tensor_storage.type = global_type; } } @@ -1006,7 +1027,7 @@ struct PickleTensorReader { ggml_type PickleTensorReader::global_type = 
GGML_TYPE_F32; // all pickle_tensors data type bool PickleTensorReader::read_global_type = false; -int find_char(uint8_t* buffer, int len, char c) { +int find_char(uint8_t *buffer, int len, char c) { for (int pos = 0; pos < len; pos++) { if (buffer[pos] == c) { return pos; @@ -1017,13 +1038,13 @@ int find_char(uint8_t* buffer, int len, char c) { #define MAX_STRING_BUFFER 512 -bool ModelLoader::parse_data_pkl(uint8_t* buffer, +bool ModelLoader::parse_data_pkl(uint8_t *buffer, size_t buffer_size, - zip_t* zip, + zip_t *zip, std::string dir, size_t file_index, - const std::string& prefix) { - uint8_t* buffer_end = buffer + buffer_size; + const std::string &prefix) { + uint8_t *buffer_end = buffer + buffer_size; if (buffer[0] == 0x80) { // proto if (buffer[1] != 2) { LOG_ERROR("Unsupported protocol\n"); @@ -1044,7 +1065,7 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, break; case ']': // EMPTY_LIST = b']' # push empty list break; - // skip unused sections + // skip unused sections case 'h': // BINGET = b'h' # " " " " " " ; " " 1-byte arg case 'q': // BINPUT = b'q' # " " " " " ; " " 1-byte arg case 'Q': // BINPERSID = b'Q' # " " " ; " " " " stack @@ -1067,7 +1088,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, buffer++; } buffer++; - } break; + } + break; case 'M': // BININT2 = b'M' # push 2-byte unsigned int { uint16_t value = read_short(buffer); @@ -1075,7 +1097,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, buffer++; } buffer += 2; - } break; + } + break; case 'J': // BININT = b'J' # push four-byte signed int { const int32_t value = read_int(buffer); @@ -1083,7 +1106,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, buffer++; // skip tuple after read num_elements } buffer += 4; - } break; + } + break; case 'X': // BINUNICODE = b'X' # " " " ; counted UTF-8 string argument { const int32_t len = read_int(buffer); @@ -1095,7 +1119,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, memcpy(string_buffer, buffer, len < MAX_STRING_BUFFER ? 
len : (MAX_STRING_BUFFER - 1)); buffer += len; reader.read_string(string_buffer, zip, dir); - } break; + } + break; case 0x8C: // SHORT_BINUNICODE = b'\x8c' # push short string; UTF-8 length < 256 bytes { const int8_t len = *buffer; @@ -1104,7 +1129,8 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, memcpy(string_buffer, buffer, len); buffer += len; // printf("String: '%s'\n", string_buffer); - } break; + } + break; case 'c': // GLOBAL = b'c' # push self.find_class(modname, name); 2 string args { int len = find_char(buffer, MAX_STRING_BUFFER, '\n'); @@ -1116,14 +1142,15 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, memcpy(string_buffer, buffer, len); buffer += len + 1; reader.read_global(string_buffer); - } break; + } + break; case 0x86: // TUPLE2 = b'\x86' # build 2-tuple from two topmost stack items case 0x85: // TUPLE1 = b'\x85' # build 1-tuple from stack top case 't': // TUPLE = b't' # build tuple from topmost stack items if (reader.phase == PickleTensorReader::READ_DIMENS) { reader.tensor_storage.reverse_ne(); reader.tensor_storage.file_index = file_index; - reader.tensor_storage.name = prefix + reader.tensor_storage.name; + reader.tensor_storage.name = prefix + reader.tensor_storage.name; tensor_storages.push_back(reader.tensor_storage); // LOG_DEBUG("%s", reader.tensor_storage.name.c_str()); // reset @@ -1141,31 +1168,31 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer, return true; } -bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::string& prefix) { +bool ModelLoader::init_from_ckpt_file(const std::string &file_path, const std::string &prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); + struct zip_t *zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; } - int n = (int)zip_entries_total(zip); + int n = (int) zip_entries_total(zip); for (int i = 0; i < n; ++i) { zip_entry_openbyindex(zip, i); { std::string name = zip_entry_name(zip); - size_t pos = name.find("data.pkl"); + size_t pos = name.find("data.pkl"); if (pos != std::string::npos) { std::string dir = name.substr(0, pos); - void* pkl_data = NULL; + void *pkl_data = NULL; size_t pkl_size; zip_entry_read(zip, &pkl_data, &pkl_size); // LOG_DEBUG("%lld", pkl_size); - parse_data_pkl((uint8_t*)pkl_data, pkl_size, zip, dir, file_index, prefix); + parse_data_pkl((uint8_t *) pkl_data, pkl_size, zip, dir, file_index, prefix); free(pkl_data); } @@ -1179,7 +1206,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s SDVersion ModelLoader::get_sd_version() { // return VERSION_1_x; TensorStorage token_embedding_weight; - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) { return VERSION_XL; } @@ -1205,7 +1232,7 @@ SDVersion ModelLoader::get_sd_version() { } ggml_type ModelLoader::get_sd_wtype() { - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } @@ -1219,16 +1246,16 @@ ggml_type ModelLoader::get_sd_wtype() { } std::string ModelLoader::load_merges() { - std::string merges_utf8_str(reinterpret_cast(merges_utf8_c_str), sizeof(merges_utf8_c_str)); + std::string merges_utf8_str(reinterpret_cast(merges_utf8_c_str), 
sizeof(merges_utf8_c_str)); return merges_utf8_str; } -void remove_duplicates(std::vector& vec) { +void remove_duplicates(std::vector &vec) { std::unordered_map name_to_index_map; for (size_t i = 0; i < vec.size(); ++i) { - const std::string& current_name = vec[i].name; - auto it = name_to_index_map.find(current_name); + const std::string ¤t_name = vec[i].name; + auto it = name_to_index_map.find(current_name); if (it != name_to_index_map.end()) { vec[it->second] = vec[i]; @@ -1242,7 +1269,7 @@ void remove_duplicates(std::vector& vec) { bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) { std::vector processed_tensor_storages; - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { // LOG_DEBUG("%s", name.c_str()); if (is_unused_tensor(tensor_storage.name)) { @@ -1264,7 +1291,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } bool is_zip = false; - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { if (tensor_storage.file_index != file_index) { continue; } @@ -1274,7 +1301,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } } - struct zip_t* zip = NULL; + struct zip_t *zip = NULL; if (is_zip) { zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { @@ -1286,16 +1313,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend std::vector read_buffer; std::vector convert_buffer; - auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) { + auto read_data = [&](const TensorStorage &tensor_storage, char *buf, size_t n) { if (zip != NULL) { zip_entry_openbyindex(zip, tensor_storage.index_in_zip); size_t entry_size = zip_entry_size(zip); if (entry_size != n) { read_buffer.resize(entry_size); - zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); - memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); + zip_entry_noallocread(zip, (void *) read_buffer.data(), entry_size); + memcpy((void *) buf, (void *) (read_buffer.data() + tensor_storage.offset), n); } else { - zip_entry_noallocread(zip, (void*)buf, n); + zip_entry_noallocread(zip, (void *) buf, n); } zip_entry_close(zip); } else { @@ -1309,13 +1336,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend return true; }; - for (auto& tensor_storage : processed_tensor_storages) { + for (auto &tensor_storage: processed_tensor_storages) { if (tensor_storage.file_index != file_index) { continue; } // LOG_DEBUG("%s", tensor_storage.name.c_str()); - ggml_tensor* dst_tensor = NULL; + ggml_tensor *dst_tensor = NULL; success = on_new_tensor_cb(tensor_storage, &dst_tensor); if (!success) { @@ -1333,35 +1360,38 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend #ifdef SD_USE_METAL || ggml_backend_is_metal(backend) #endif - ) { + ) { // for the CPU and Metal backend, we can copy directly into the tensor if (tensor_storage.type == dst_tensor->type) { GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); - read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read); + read_data(tensor_storage, (char *) dst_tensor->data, nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); + bf16_to_f32_vec((uint16_t *) dst_tensor->data, (float *) dst_tensor->data, + tensor_storage.nelements()); } } else 
{ read_buffer.resize(tensor_storage.nbytes()); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); + read_data(tensor_storage, (char *) read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + bf16_to_f32_vec((uint16_t *) read_buffer.data(), (float *) read_buffer.data(), + tensor_storage.nelements()); } - convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int)tensor_storage.nelements()); + convert_tensor((void *) read_buffer.data(), tensor_storage.type, dst_tensor->data, + dst_tensor->type, (int) tensor_storage.nelements()); } } else { read_buffer.resize(tensor_storage.nbytes()); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); + read_data(tensor_storage, (char *) read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + bf16_to_f32_vec((uint16_t *) read_buffer.data(), (float *) read_buffer.data(), + tensor_storage.nelements()); } if (tensor_storage.type == dst_tensor->type) { @@ -1370,9 +1400,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } else { // convert first, then copy to device memory convert_buffer.resize(ggml_nbytes(dst_tensor)); - convert_tensor((void*)read_buffer.data(), tensor_storage.type, - (void*)convert_buffer.data(), dst_tensor->type, - (int)tensor_storage.nelements()); + convert_tensor((void *) read_buffer.data(), tensor_storage.type, + (void *) convert_buffer.data(), dst_tensor->type, + (int) tensor_storage.nelements()); ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); } } @@ -1389,35 +1419,41 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend return success; } -bool ModelLoader::load_tensors(std::map& tensors, +bool ModelLoader::load_tensors(std::map &tensors, ggml_backend_t backend, - std::set ignore_tensors) { + std::set ignore_tensors, + bool standalone) { std::set tensor_names_in_file; - auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { - const std::string& name = tensor_storage.name; + auto on_new_tensor_cb = [&](const TensorStorage &tensor_storage, ggml_tensor **dst_tensor) -> bool { + const std::string &name = tensor_storage.name; tensor_names_in_file.insert(name); - struct ggml_tensor* real; + struct ggml_tensor *real; if (tensors.find(name) != tensors.end()) { real = tensors[name]; } else { if (ignore_tensors.find(name) == ignore_tensors.end()) { - LOG_WARN("unknown tensor '%s' in model file", name.c_str()); + if (standalone) { + LOG_WARN("unknown tensor '%s' in model file", name.c_str()); + } else { + LOG_DEBUG("unknown tensor '%s' in model file", name.c_str()); + } } return true; } if ( - real->ne[0] != tensor_storage.ne[0] || - real->ne[1] != tensor_storage.ne[1] || - real->ne[2] != tensor_storage.ne[2] || - real->ne[3] != tensor_storage.ne[3]) { + real->ne[0] != tensor_storage.ne[0] || + real->ne[1] != tensor_storage.ne[1] || + real->ne[2] != tensor_storage.ne[2] || + real->ne[3] != tensor_storage.ne[3]) { LOG_ERROR( - "tensor '%s' has wrong shape in model file: " - "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", - name.c_str(), - (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3], - 
(int)real->ne[0], (int)real->ne[1], (int)real->ne[2], (int)real->ne[3]); + "tensor '%s' has wrong shape in model file: " + "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", + name.c_str(), + (int) tensor_storage.ne[0], (int) tensor_storage.ne[1], (int) tensor_storage.ne[2], + (int) tensor_storage.ne[3], + (int) real->ne[0], (int) real->ne[1], (int) real->ne[2], (int) real->ne[3]); return false; } @@ -1434,7 +1470,7 @@ bool ModelLoader::load_tensors(std::map& tenso bool some_tensor_not_init = false; - for (auto pair : tensors) { + for (auto pair: tensors) { if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) { continue; } @@ -1458,18 +1494,18 @@ bool ModelLoader::load_tensors(std::map& tenso return true; } -bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) { - auto backend = ggml_backend_cpu_init(); +bool ModelLoader::save_to_gguf_file(const std::string &file_path, ggml_type type) { + auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); mem_size += cal_mem_size(backend, type); LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f); - ggml_context* ggml_ctx = ggml_init({mem_size, NULL, false}); + ggml_context *ggml_ctx = ggml_init({mem_size, NULL, false}); - gguf_context* gguf_ctx = gguf_init_empty(); + gguf_context *gguf_ctx = gguf_init_empty(); - auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { - const std::string& name = tensor_storage.name; + auto on_new_tensor_cb = [&](const TensorStorage &tensor_storage, ggml_tensor **dst_tensor) -> bool { + const std::string &name = tensor_storage.name; ggml_type tensor_type = tensor_storage.type; if (type != GGML_TYPE_COUNT) { @@ -1480,7 +1516,7 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type } } - ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); + ggml_tensor *tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); if (tensor == NULL) { LOG_ERROR("ggml_new_tensor failed"); return false; @@ -1519,14 +1555,14 @@ int64_t ModelLoader::cal_mem_size(ggml_backend_t backend, ggml_type type) { } int64_t mem_size = 0; std::vector processed_tensor_storages; - for (auto& tensor_storage : tensor_storages) { + for (auto &tensor_storage: tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } preprocess_tensor(tensor_storage, processed_tensor_storages); } - for (auto& tensor_storage : processed_tensor_storages) { + for (auto &tensor_storage: processed_tensor_storages) { ggml_type tensor_type = tensor_storage.type; if (type != GGML_TYPE_COUNT) { if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) { @@ -1542,7 +1578,7 @@ int64_t ModelLoader::cal_mem_size(ggml_backend_t backend, ggml_type type) { return mem_size; } -bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { +bool convert(const char *input_path, const char *vae_path, const char *output_path, sd_type_t output_type) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -1556,6 +1592,6 @@ bool convert(const char* input_path, const char* vae_path, const char* output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); + bool success = model_loader.save_to_gguf_file(output_path, 
(ggml_type) output_type); return success; } \ No newline at end of file diff --git a/model.h b/model.h index 4b692a30..b0d61547 100644 --- a/model.h +++ b/model.h @@ -120,7 +120,7 @@ class ModelLoader { bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend); bool load_tensors(std::map& tensors, ggml_backend_t backend, - std::set ignore_tensors = {}); + std::set ignore_tensors = {}, bool standalone=true); bool save_to_gguf_file(const std::string& file_path, ggml_type type); int64_t cal_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 10e24585..3954e326 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -14,59 +14,60 @@ #include "unet.hpp" #include "vae.hpp" -const char* model_version_to_str[] = { - "1.x", - "2.x", - "XL", +const char *model_version_to_str[] = { + "1.x", + "2.x", + "XL", }; -const char* sampling_methods_str[] = { - "Euler A", - "Euler", - "Heun", - "DPM2", - "DPM++ (2s)", - "DPM++ (2M)", - "modified DPM++ (2M)", - "LCM", +const char *sampling_methods_str[] = { + "Euler A", + "Euler", + "Heun", + "DPM2", + "DPM++ (2s)", + "DPM++ (2M)", + "modified DPM++ (2M)", + "LCM", }; /*================================================== Helper Functions ================================================*/ -void calculate_alphas_cumprod(float* alphas_cumprod, +void calculate_alphas_cumprod(float *alphas_cumprod, float linear_start = 0.00085f, - float linear_end = 0.0120, - int timesteps = TIMESTEPS) { + float linear_end = 0.0120, + int timesteps = TIMESTEPS) { float ls_sqrt = sqrtf(linear_start); float le_sqrt = sqrtf(linear_end); - float amount = le_sqrt - ls_sqrt; + float amount = le_sqrt - ls_sqrt; float product = 1.0f; for (int i = 0; i < timesteps; i++) { - float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); + float beta = ls_sqrt + amount * ((float) i / (timesteps - 1)); product *= 1.0f - powf(beta, 2.0f); alphas_cumprod[i] = product; } } + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { public: SDVersion version; - bool vae_decode_only = false; + bool vae_decode_only = false; bool free_params_immediately = false; std::shared_ptr rng = std::make_shared(); - int n_threads = -1; - float scale_factor = 0.18215f; + int n_threads = -1; + float scale_factor = 0.18215f; FrozenCLIPEmbedderWithCustomWords cond_stage_model; UNetModel diffusion_model; AutoEncoderKL first_stage_model; bool use_tiny_autoencoder = false; - bool vae_tiling = false; + bool vae_tiling = false; - std::map tensors; + std::map tensors; std::string lora_model_dir; // lora_name => multiplier @@ -74,45 +75,52 @@ class StableDiffusionGGML { std::map loras; std::shared_ptr denoiser = std::make_shared(); - ggml_backend_t backend = NULL; // general backend - ggml_type model_data_type = GGML_TYPE_COUNT; + schedule_t schedule = DEFAULT; + + ggml_backend_t backend = NULL; // general backend + ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type + ggml_type wtype = GGML_TYPE_COUNT; // options weight type TinyAutoEncoder tae_first_stage; std::string taesd_path; + ModelLoader model_loader; + StableDiffusionGGML() = default; StableDiffusionGGML(int n_threads, bool vae_decode_only, bool free_params_immediately, std::string lora_model_dir, - rng_type_t rng_type) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - 
free_params_immediately(free_params_immediately),
-          lora_model_dir(lora_model_dir) {
+                        rng_type_t rng_type,
+                        bool vae_tiling,
+                        ggml_type wtype,
+                        schedule_t schedule,
+                        bool init_backend_immediately = true)
+        : n_threads(n_threads),
+          vae_decode_only(vae_decode_only),
+          free_params_immediately(free_params_immediately),
+          lora_model_dir(lora_model_dir),
+          vae_tiling(vae_tiling),
+          wtype(wtype),
+          schedule(schedule) {
         first_stage_model.decode_only = vae_decode_only;
-        tae_first_stage.decode_only = vae_decode_only;
+        tae_first_stage.decode_only = vae_decode_only;
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
         }
+        if (init_backend_immediately) {
+            init_backend();
+        }
     }

     ~StableDiffusionGGML() {
         ggml_backend_free(backend);
     }

-    bool load_from_file(const std::string& model_path,
-                        const std::string& vae_path,
-                        const std::string& taesd_path,
-                        bool vae_tiling,
-                        ggml_type wtype,
-                        schedule_t schedule) {
-        this->use_tiny_autoencoder = taesd_path.size() > 0;
-        this->taesd_path = taesd_path;
-        this->vae_tiling = vae_tiling;
+    void init_backend() {
 #ifdef SD_USE_CUBLAS
         LOG_DEBUG("Using CUDA backend");
         backend = ggml_backend_cuda_init(0);
@@ -134,18 +142,43 @@ class StableDiffusionGGML {
         LOG_INFO("Flash Attention enabled");
 #endif
 #endif
-        LOG_INFO("loading model from '%s'", model_path.c_str());
-        ModelLoader model_loader;
+    }
+
+    void set_options(int n_threads,
+                     bool vae_decode_only,
+                     bool free_params_immediately,
+                     std::string lora_model_dir,
+                     rng_type_t rng_type,
+                     bool vae_tiling,
+                     sd_type_t wtype,
+                     schedule_t schedule) {
+        this->n_threads = n_threads;
+        this->vae_decode_only = vae_decode_only;
+        this->free_params_immediately = free_params_immediately;
+        this->lora_model_dir = lora_model_dir;
+        if (rng_type == STD_DEFAULT_RNG) {
+            rng = std::make_shared<STDDefaultRNG>();
+        } else if (rng_type == CUDA_RNG) {
+            rng = std::make_shared<PhiloxRNG>();
+        }
+        this->vae_tiling = vae_tiling;
+        this->wtype = (ggml_type) wtype;
+        this->schedule = schedule;
+        apply_schedule();
+    }

-        if (!model_loader.init_from_file(model_path)) {
-            LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
+    bool load_clip_from_file(const std::string &model_path, bool standalone = true, const std::string &prefix = "te.") {
+        if (backend == NULL) {
+            LOG_ERROR("backend is not initialized; when init_backend_immediately is false, call init_backend first");
             return false;
         }

-        if (vae_path.size() > 0) {
-            LOG_INFO("loading vae from '%s'", vae_path.c_str());
-            if (!model_loader.init_from_file(vae_path, "vae.")) {
-                LOG_WARN("loading vae from '%s' failed", vae_path.c_str());
+        if (!model_path.empty()) {
+            LOG_INFO("loading clip from '%s'", model_path.c_str());
+            if (!model_loader.init_from_file(model_path, prefix)) {
+                LOG_WARN("loading clip from '%s' failed", model_path.c_str());
+                return false;
             }
         }

@@ -154,18 +187,30 @@ class StableDiffusionGGML {
             LOG_ERROR("get sd version from file failed: '%s'", model_path.c_str());
             return false;
         }
+
         if (version == VERSION_XL) {
             scale_factor = 0.13025f;
         }
+
         cond_stage_model = FrozenCLIPEmbedderWithCustomWords(version);
-        diffusion_model = UNetModel(version);

         LOG_INFO("Stable Diffusion %s ", model_version_to_str[version]);
+
+        auto autodiscover_wtype = model_loader.get_sd_wtype();
+
         if (wtype == GGML_TYPE_COUNT) {
-            model_data_type = model_loader.get_sd_wtype();
+            model_data_type = autodiscover_wtype;
         } else {
-            model_data_type = wtype;
+            if (wtype > autodiscover_wtype) {
+                LOG_WARN("Stable Diffusion weight type can't be set to %s, falling back to %s",
default: %s", + ggml_type_name(wtype), + ggml_type_name(model_data_type)); + model_data_type = autodiscover_wtype; + } else { + model_data_type = wtype; + } } + LOG_INFO("Stable Diffusion weight type: %s", ggml_type_name(model_data_type)); LOG_DEBUG("loading vocab"); @@ -177,99 +222,132 @@ class StableDiffusionGGML { cond_stage_model.tokenizer.load_from_merges(merges_utf8_str); - // create the ggml context for network params - LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); + if (!cond_stage_model.alloc_params_buffer(backend, model_data_type)) { + return false; + } - if ( - !cond_stage_model.alloc_params_buffer(backend, model_data_type) || - !diffusion_model.alloc_params_buffer(backend, model_data_type)) { + LOG_DEBUG("preparing memory for clip weights"); + // prepare memory for the weights + { + cond_stage_model.init_params(); + cond_stage_model.map_by_name(tensors, "cond_stage_model."); + } + + struct ggml_init_params params; + params.mem_size = static_cast(3 * 1024) * 1024; // 10M + params.mem_buffer = NULL; + params.no_alloc = false; + // LOG_DEBUG("mem_size %u ", params.mem_size); + struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + if (!ctx) { + LOG_ERROR("ggml_init() failed"); return false; } - ggml_type vae_type = model_data_type; - if (version == VERSION_XL) { - vae_type = GGML_TYPE_F32; // avoid nan, not work... + // load weights + LOG_DEBUG("loading clip weights"); + int64_t t0 = ggml_time_ms(); + + std::map tensors_need_to_load; + std::set ignore_tensors; + + for (auto &pair: tensors) { + tensors_need_to_load.insert(pair); + } + + bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors, standalone); + if (!success) { + LOG_ERROR("load tensors from clip model failed"); + ggml_free(ctx); + return false; + } + + LOG_INFO("clip memory buffer size = %.2fMB", cond_stage_model.params_buffer_size / 1024.0 / 1024.0); + int64_t t1 = ggml_time_ms(); + LOG_INFO("loading clip model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); + ggml_free(ctx); + return true; + } + + void free_clip_params() { + if (cond_stage_model.params_buffer_size > 0) { + cond_stage_model.free_params_buffer(); + } + } + + bool load_unet_from_file(const std::string &model_path, + bool standalone = true, + const std::string &prefix = "unet.") { + if (backend == NULL) { + LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); + return false; + } + + if (version == VERSION_COUNT) { + LOG_ERROR("get sd version from file failed: '%s' ,make sure clip model has loaded", model_path.c_str()); + return false; + } + + if (!model_path.empty() && standalone) { + LOG_INFO("loading unet from '%s'", model_path.c_str()); + if (!model_loader.init_from_file(model_path, prefix)) { + LOG_WARN("loading unet from '%s' failed", model_path.c_str()); + return false; + } } - if (!use_tiny_autoencoder && !first_stage_model.alloc_params_buffer(backend, vae_type)) { + diffusion_model = UNetModel(version); + if (!diffusion_model.alloc_params_buffer(backend, model_data_type)) { return false; } - LOG_DEBUG("preparing memory for the weights"); + LOG_DEBUG("preparing memory for unet weights"); // prepare memory for the weights { - // cond_stage_model(FrozenCLIPEmbedder) - cond_stage_model.init_params(); - cond_stage_model.map_by_name(tensors, "cond_stage_model."); - // diffusion_model(UNetModel) diffusion_model.init_params(); diffusion_model.map_by_name(tensors, 
"model.diffusion_model."); - - if (!use_tiny_autoencoder) { - // firest_stage_model(AutoEncoderKL) - first_stage_model.init_params(); - } - first_stage_model.map_by_name(tensors, "first_stage_model."); } struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_size = static_cast(3 * 1024) * 1024; // 10M params.mem_buffer = NULL; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + params.no_alloc = false; + + struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + if (!ctx) { LOG_ERROR("ggml_init() failed"); return false; } - ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); - calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); // load weights LOG_DEBUG("loading weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set ignore_tensors; + ggml_tensor *alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); + calculate_alphas_cumprod((float *) alphas_cumprod_tensor->data); tensors_need_to_load["alphas_cumprod"] = alphas_cumprod_tensor; - for (auto& pair : tensors) { - const std::string& name = pair.first; - - if (use_tiny_autoencoder && starts_with(name, "first_stage_model.")) { - ignore_tensors.insert(name); - continue; - } - - if (vae_decode_only && (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) { + for (auto &pair: tensors) { + const std::string &name = pair.first; + if (starts_with(name, "cond_stage_model.") || starts_with(name, "first_stage_model.")) { ignore_tensors.insert(name); continue; } - tensors_need_to_load.insert(pair); } - bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors); + bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors, standalone); if (!success) { - LOG_ERROR("load tensors from model loader failed"); + LOG_ERROR("load unet tensors from model loader failed"); ggml_free(ctx); return false; } - - // LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0); - - size_t total_params_size = - cond_stage_model.params_buffer_size + - diffusion_model.params_buffer_size + - first_stage_model.params_buffer_size; - LOG_INFO("total memory buffer size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB)", - total_params_size / 1024.0 / 1024.0, - cond_stage_model.params_buffer_size / 1024.0 / 1024.0, - diffusion_model.params_buffer_size / 1024.0 / 1024.0, - first_stage_model.params_buffer_size / 1024.0 / 1024.0); + LOG_INFO("unet memory buffer size = %.2fMB", diffusion_model.params_buffer_size / 1024.0 / 1024.0); int64_t t1 = ggml_time_ms(); - LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); + LOG_INFO("loading unet model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); - // check is_using_v_parameterization_for_sd2 bool is_using_v_parameterization = false; if (version == VERSION_2_x) { if (is_using_v_parameterization_for_sd2(ctx)) { @@ -284,64 +362,183 @@ class StableDiffusionGGML { LOG_INFO("running in eps-prediction mode"); } - if (schedule != DEFAULT) { - switch (schedule) { - case DISCRETE: - LOG_INFO("running with discrete schedule"); - denoiser->schedule = std::make_shared(); - break; - case KARRAS: - LOG_INFO("running 
with Karras schedule"); - denoiser->schedule = std::make_shared(); - break; - case DEFAULT: - // Don't touch anything. - break; - default: - LOG_ERROR("Unknown schedule %i", schedule); - abort(); + apply_schedule(); + ggml_free(ctx); + return true; + } + + + void free_unet_params() { + if (diffusion_model.params_buffer_size > 0) { + diffusion_model.free_params_buffer(); + } + } + + bool load_vae_from_file(const std::string &model_path, + bool standalone = true, + const std::string &prefix = "vae.") { + if (backend == NULL) { + LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); + return false; + } + + if (version == VERSION_COUNT) { + LOG_ERROR("get sd version from file failed: '%s' ,please call load_clip_from_file first", + model_path.c_str()); + return false; + } + + if (!model_path.empty() && standalone) { + LOG_INFO("loading vae from '%s'", model_path.c_str()); + if (!model_loader.init_from_file(model_path, prefix)) { + LOG_WARN("loading vae from '%s' failed", model_path.c_str()); + return false; } } - for (int i = 0; i < TIMESTEPS; i++) { - denoiser->schedule->alphas_cumprod[i] = ((float*)alphas_cumprod_tensor->data)[i]; - denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); - denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); + ggml_type vae_type = model_data_type; + if (version == VERSION_XL) { + vae_type = GGML_TYPE_F32; // avoid nan, not work... + } + + if (!first_stage_model.alloc_params_buffer(backend, vae_type)) { + return false; + } + + LOG_DEBUG("preparing memory for vae weights"); + // prepare memory for the weights + { + first_stage_model.init_params(); + first_stage_model.map_by_name(tensors, "first_stage_model."); + } + + struct ggml_init_params params; + params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_buffer = NULL; + params.no_alloc = false; + // LOG_DEBUG("mem_size %u ", params.mem_size); + struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + if (!ctx) { + LOG_ERROR("ggml_init() failed"); + return false; + } + + // load weights + LOG_DEBUG("loading weights"); + int64_t t0 = ggml_time_ms(); + + std::map tensors_need_to_load; + std::set ignore_tensors; + for (auto &pair: tensors) { + const std::string &name = pair.first; + // TODO: make it can reload in compute time. so we can set vae_decode_only dynamic. 
+            if (vae_decode_only &&
+                (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) {
+                ignore_tensors.insert(name);
+                continue;
+            }
+
+            tensors_need_to_load.insert(pair);
         }
-        LOG_DEBUG("finished loaded file");
+        bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors, standalone);
+        if (!success) {
+            LOG_ERROR("load tensors from vae model failed");
+            ggml_free(ctx);
+            return false;
+        }
+        LOG_INFO("vae memory buffer size = %.2fMB", first_stage_model.params_buffer_size / 1024.0 / 1024.0);
+        int64_t t1 = ggml_time_ms();
+        LOG_INFO("loading vae model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000);
         ggml_free(ctx);
-        if (use_tiny_autoencoder) {
-            return tae_first_stage.load_from_file(taesd_path, backend);
+        return true;
+    }
+
+    void free_vae_params() {
+        if (first_stage_model.params_buffer_size > 0) {
+            first_stage_model.free_params_buffer();
+        }
+    }
+
+    // load all model components from a single file
+    bool load_diffusions_from_file(const std::string &model_path) {
+        LOG_INFO("loading model from '%s'", model_path.c_str());
+        if (!load_clip_from_file(model_path, false, "")) {
+            free_clip_params();
+            return false;
         }
+
+        if (!load_unet_from_file(model_path, false, "")) {
+            free_clip_params();
+            free_unet_params();
+            return false;
+        }
+
+        if (!load_vae_from_file(model_path, false, "")) {
+            free_clip_params();
+            free_unet_params();
+            free_vae_params();
+            return false;
+        }
+        return true;
     }

-    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) {
-        struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
+    void free_diffusions_params() {
+        free_clip_params();
+        LOG_INFO("freed clip params");
+
+        free_unet_params();
+        LOG_INFO("freed unet params");
+
+        free_vae_params();
+        LOG_INFO("freed vae params");
+    }
+
+    bool load_taesd_from_file(const std::string &taesd_path) {
+        if (first_stage_model.params_buffer_size > 0) {
+            free_vae_params();
+        }
+        if (taesd_path.empty() || !tae_first_stage.load_from_file(taesd_path, backend)) {
+            return false;
+        }
+        use_tiny_autoencoder = true;
+        return true;
+    }
+
+    void free_taesd_params() {
+        if (tae_first_stage.params_buffer_size > 0) {
+            tae_first_stage.free_params_buffer();
+        }
+    }
+
+    bool is_using_v_parameterization_for_sd2(ggml_context *work_ctx) {
+        struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
         ggml_set_f32(x_t, 0.5);
-        struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
+        struct ggml_tensor *c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
         ggml_set_f32(c, 0.5);

-        struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1);  // [N, ]
-        struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+        struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32,
+                                                           1);  // [N, ]
+        struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps,
+                                                           diffusion_model.model_channels);  // [N, model_channels]

         int64_t t0 = ggml_time_ms();
         ggml_set_f32(timesteps, 999);
         set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels);
-        struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
+        struct ggml_tensor *out = ggml_dup_tensor(work_ctx, x_t);
         diffusion_model.alloc_compute_buffer(x_t, c, t_emb);
         diffusion_model.compute(out, n_threads, x_t, NULL, c, t_emb);
         diffusion_model.free_compute_buffer();

         double result = 0.f;
         {
-            float* vec_x = (float*)x_t->data;
-            float* vec_out = (float*)out->data;
+            float *vec_x = (float *) x_t->data;
+            float *vec_out = (float *) out->data;
             int64_t n = ggml_nelements(out);
             for (int i = 0; i < n; i++) {
-                result += ((double)vec_out[i] - (double)vec_x[i]);
+                result += ((double) vec_out[i] - (double) vec_x[i]);
             }
             result /= n;
         }
@@ -350,9 +547,39 @@ class StableDiffusionGGML {
         return result < -1;
     }

-    void apply_lora(const std::string& lora_name, float multiplier) {
-        int64_t t0 = ggml_time_ms();
-        std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
+    void apply_schedule() const {
+        float alphas_cumprod_tensor[TIMESTEPS];
+        calculate_alphas_cumprod(alphas_cumprod_tensor);
+        if (schedule != DEFAULT) {
+            switch (schedule) {
+                case DISCRETE:
+                    LOG_INFO("running with discrete schedule");
+                    denoiser->schedule = std::make_shared<DiscreteSchedule>();
+                    break;
+                case KARRAS:
+                    LOG_INFO("running with Karras schedule");
+                    denoiser->schedule = std::make_shared<KarrasSchedule>();
+                    break;
+                case DEFAULT:
+                    // Don't touch anything.
+                    break;
+                default:
+                    LOG_ERROR("Unknown schedule %i", schedule);
+                    abort();
+            }
+        }
+
+        for (int i = 0; i < TIMESTEPS; i++) {
+            denoiser->schedule->alphas_cumprod[i] = alphas_cumprod_tensor[i];
+            denoiser->schedule->sigmas[i] = std::sqrt(
+                (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]);
+            denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]);
+        }
+    }
+
+    void apply_lora(const std::string &lora_name, float multiplier) {
+        int64_t t0 = ggml_time_ms();
+        std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
         std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
         std::string file_path;
         if (file_exists(st_file_path)) {
@@ -360,7 +587,8 @@ class StableDiffusionGGML {
         } else if (file_exists(ckpt_file_path)) {
             file_path = ckpt_file_path;
         } else {
-            LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
+            LOG_WARN("cannot find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(),
+                     lora_name.c_str());
             return;
         }
         LoraModel lora(file_path);
@@ -381,14 +609,14 @@ class StableDiffusionGGML {
                  (t1 - t0) * 1.0f / 1000);
     }

-    void apply_loras(const std::unordered_map<std::string, float>& lora_state) {
+    void apply_loras(const std::unordered_map<std::string, float> &lora_state) {
         if (lora_state.size() > 0 && model_data_type != GGML_TYPE_F16 && model_data_type != GGML_TYPE_F32) {
             LOG_WARN("In quantized models when applying LoRA, the images have poor quality.");
         }
         std::unordered_map<std::string, float> lora_state_diff;
-        for (auto& kv : lora_state) {
-            const std::string& lora_name = kv.first;
-            float multiplier = kv.second;
+        for (auto &kv: lora_state) {
+            const std::string &lora_name = kv.first;
+            float multiplier = kv.second;

             if (curr_lora_state.find(lora_name) != curr_lora_state.end()) {
                 float curr_multiplier = curr_lora_state[lora_name];
@@ -401,35 +629,35 @@ class StableDiffusionGGML {
             }
         }

-        for (auto& kv : lora_state_diff) {
+        for (auto &kv: lora_state_diff) {
             apply_lora(kv.first, kv.second);
         }

         curr_lora_state = lora_state;
     }

-    std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition(ggml_context* work_ctx,
-                                                                const std::string& text,
-                                                                int clip_skip,
-                                                                int width,
-                                                                int height,
-                                                                bool force_zero_embeddings = false) {
+    std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition(ggml_context *work_ctx,
+                                                                const std::string &text,
+                                                                int clip_skip,
+                                                                int width,
+                                                                int height,
+                                                                bool force_zero_embeddings = false) {
         cond_stage_model.set_clip_skip(clip_skip);
-        auto tokens_and_weights = cond_stage_model.tokenize(text, true);
-        std::vector<int>&
tokens = tokens_and_weights.first; - std::vector& weights = tokens_and_weights.second; - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* pooled = NULL; - size_t total_hidden_size = cond_stage_model.text_model.hidden_size; + auto tokens_and_weights = cond_stage_model.tokenize(text, true); + std::vector &tokens = tokens_and_weights.first; + std::vector &weights = tokens_and_weights.second; + int64_t t0 = ggml_time_ms(); + struct ggml_tensor *pooled = NULL; + size_t total_hidden_size = cond_stage_model.text_model.hidden_size; if (version == VERSION_XL) { total_hidden_size += cond_stage_model.text_model2.hidden_size; pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, cond_stage_model.text_model2.projection_dim); } - struct ggml_tensor* hidden_states = ggml_new_tensor_2d(work_ctx, + struct ggml_tensor *hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, total_hidden_size, cond_stage_model.text_model.max_position_embeddings); // [N, n_token, hidden_size] - cond_stage_model.alloc_compute_buffer(work_ctx, (int)tokens.size()); + cond_stage_model.alloc_compute_buffer(work_ctx, (int) tokens.size()); cond_stage_model.compute(n_threads, tokens, hidden_states, pooled); cond_stage_model.free_compute_buffer(); // if (pooled != NULL) { @@ -439,7 +667,7 @@ class StableDiffusionGGML { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states); + ggml_tensor *result = ggml_dup_tensor(work_ctx, hidden_states); { float original_mean = ggml_tensor_mean(hidden_states); for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) { @@ -455,33 +683,34 @@ class StableDiffusionGGML { ggml_tensor_scale(result, (original_mean / new_mean)); } if (force_zero_embeddings) { - float* vec = (float*)result->data; + float *vec = (float *) result->data; for (int i = 0; i < ggml_nelements(result); i++) { vec[i] = 0; } } - ggml_tensor* vec = NULL; + ggml_tensor *vec = NULL; if (version == VERSION_XL) { int out_dim = 256; - vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); + vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); // [0:1280] size_t offset = 0; memcpy(vec->data, pooled->data, ggml_nbytes(pooled)); offset += ggml_nbytes(pooled); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); // original_size_as_tuple - float orig_width = (float)width; - float orig_height = (float)height; + float orig_width = (float) width; + float orig_height = (float) height; ggml_tensor_set_f32(timesteps, orig_height, 0); ggml_tensor_set_f32(timesteps, orig_width, 1); - ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); + ggml_tensor *embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, + offset); offset += ggml_nbytes(embed_view); set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // crop_coords_top_left - float crop_coord_top = 0.f; + float crop_coord_top = 0.f; float crop_coord_left = 0.f; ggml_tensor_set_f32(timesteps, crop_coord_top, 0); ggml_tensor_set_f32(timesteps, crop_coord_left, 1); @@ -490,8 +719,8 @@ class StableDiffusionGGML { set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // 
target_size_as_tuple - float target_width = (float)width; - float target_height = (float)height; + float target_width = (float) width; + float target_height = (float) height; ggml_tensor_set_f32(timesteps, target_height, 0); ggml_tensor_set_f32(timesteps, target_width, 1); embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); @@ -504,25 +733,27 @@ class StableDiffusionGGML { return {result, vec}; } - ggml_tensor* sample(ggml_context* work_ctx, - ggml_tensor* x_t, - ggml_tensor* noise, - ggml_tensor* c, - ggml_tensor* c_vector, - ggml_tensor* uc, - ggml_tensor* uc_vector, + ggml_tensor *sample(ggml_context *work_ctx, + ggml_tensor *x_t, + ggml_tensor *noise, + ggml_tensor *c, + ggml_tensor *c_vector, + ggml_tensor *uc, + ggml_tensor *uc_vector, float cfg_scale, sample_method_t method, - const std::vector& sigmas) { + const std::vector &sigmas) { size_t steps = sigmas.size() - 1; // x_t = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(x_t); - struct ggml_tensor* x = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *x = ggml_dup_tensor(work_ctx, x_t); copy_ggml_tensor(x, x_t); - struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); // [N, ] - struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor *noised_input = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + 1); // [N, ] + struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] diffusion_model.alloc_compute_buffer(noised_input, c, t_emb, c_vector); bool has_unconditioned = cfg_scale != 1.0 && uc != NULL; @@ -537,31 +768,31 @@ class StableDiffusionGGML { } // denoise wrapper - struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* out_uncond = NULL; + struct ggml_tensor *out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *out_uncond = NULL; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); } - struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *denoised = ggml_dup_tensor(work_ctx, x); - auto denoise = [&](ggml_tensor* input, float sigma, int step) { + auto denoise = [&](ggml_tensor *input, float sigma, int step) { if (step == 1) { - pretty_progress(0, (int)steps, 0); + pretty_progress(0, (int) steps, 0); } int64_t t0 = ggml_time_us(); - float c_skip = 1.0f; - float c_out = 1.0f; - float c_in = 1.0f; + float c_skip = 1.0f; + float c_out = 1.0f; + float c_in = 1.0f; std::vector scaling = denoiser->get_scalings(sigma); if (scaling.size() == 3) { // CompVisVDenoiser c_skip = scaling[0]; - c_out = scaling[1]; - c_in = scaling[2]; + c_out = scaling[1]; + c_in = scaling[2]; } else { // CompVisDenoiser c_out = scaling[0]; - c_in = scaling[1]; + c_in = scaling[1]; } float t = denoiser->schedule->sigma_to_t(sigma); @@ -575,16 +806,16 @@ class StableDiffusionGGML { // cond diffusion_model.compute(out_cond, n_threads, noised_input, NULL, c, t_emb, c_vector); - float* negative_data = NULL; + float *negative_data = NULL; if (has_unconditioned) { // uncond diffusion_model.compute(out_uncond, n_threads, noised_input, NULL, uc, t_emb, uc_vector); - negative_data = (float*)out_uncond->data; + negative_data = (float *) out_uncond->data; } - float* vec_denoised = 
(float*)denoised->data; - float* vec_input = (float*)input->data; - float* positive_data = (float*)out_cond->data; - int ne_elements = (int)ggml_nelements(denoised); + float *vec_denoised = (float *) denoised->data; + float *vec_input = (float *) input->data; + float *positive_data = (float *) out_cond->data; + int ne_elements = (int) ggml_nelements(denoised); for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -597,7 +828,7 @@ class StableDiffusionGGML { } int64_t t1 = ggml_time_us(); if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + pretty_progress(step, (int) steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } }; @@ -605,8 +836,8 @@ class StableDiffusionGGML { // sample_euler_ancestral switch (method) { case EULER_A: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -616,9 +847,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int i = 0; i < ggml_nelements(d); i++) { vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; @@ -626,16 +857,18 @@ class StableDiffusionGGML { } // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); // Euler method float dt = sigma_down - sigmas[i]; // x = x + d * dt { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_d[i] * dt; @@ -647,8 +880,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -656,10 +889,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case EULER: // Implemented without any sigma churn { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -669,9 +903,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma; @@ -681,18 +915,19 @@ 
class StableDiffusionGGML { float dt = sigmas[i + 1] - sigma; // x = x + d * dt { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } } - } break; + } + break; case HEUN: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -700,9 +935,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -713,25 +948,25 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } else { // Heun step - float* vec_d = (float*)d->data; - float* vec_d2 = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; + float *vec_d = (float *) d->data; + float *vec_d2 = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt; } denoise(x2, sigmas[i + 1], i + 1); - float* vec_denoised = (float*)denoised->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; vec_d[j] = (vec_d[j] + d2) / 2; @@ -739,10 +974,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case DPM2: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -750,9 +986,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -762,9 +998,9 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float dt = sigmas[i + 1] - sigmas[i]; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float dt = sigmas[i + 1] - sigmas[i]; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; @@ -772,18 +1008,18 @@ class StableDiffusionGGML { } else { // DPM-Solver-2 float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1]))); - float dt_1 = sigma_mid - sigmas[i]; - float dt_2 = sigmas[i + 1] - sigmas[i]; + float dt_1 = sigma_mid - sigmas[i]; + float dt_2 = sigmas[i + 1] - sigmas[i]; - float* vec_d = 
(float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt_1; } denoise(x2, sigma_mid, i + 1); - float* vec_denoised = (float*)denoised->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; vec_x[j] = vec_x[j] + d2 * dt_2; @@ -791,28 +1027,31 @@ class StableDiffusionGGML { } } - } break; + } + break; case DPMPP2S_A: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise denoise(x, sigmas[i], i + 1); // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - auto sigma_fn = [](float t) -> float { return exp(-t); }; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + auto sigma_fn = [](float t) -> float { return exp(-t); }; if (sigma_down == 0) { // Euler step - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -828,15 +1067,15 @@ class StableDiffusionGGML { } } else { // DPM-Solver++(2S) - float t = t_fn(sigmas[i]); + float t = t_fn(sigmas[i]); float t_next = t_fn(sigma_down); - float h = t_next - t; - float s = t + 0.5f * h; + float h = t_next - t; + float s = t + 0.5f * h; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; + float *vec_denoised = (float *) denoised->data; // First half-step for (int j = 0; j < ggml_nelements(x); j++) { @@ -855,8 +1094,8 @@ class StableDiffusionGGML { if (sigmas[i + 1] > 0) { ggml_tensor_set_f32_randn(noise, rng); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -864,10 +1103,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case DPMPP2M: // DPM++ (2M) from Karras et al (2022) { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -875,14 +1115,14 @@ class StableDiffusionGGML { // denoise 
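                    // The update below is DPM-Solver++(2M) in log-sigma space: with
                    // t = -log(sigma) and h = t_{i+1} - t_i, each step computes
                    //     x <- (sigma_{i+1} / sigma_i) * x - (exp(-h) - 1) * D
                    // where D extrapolates the current and previous denoised outputs
                    // using r = h_last / h:
                    //     D = (1 + 1/(2r)) * denoised_i - 1/(2r) * denoised_{i-1}
                    // matching the `a`, `b`, and `denoised_d` values computed below.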
denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float b = exp(-h) - 1.f; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float b = exp(-h) - 1.f; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; + float *vec_old_denoised = (float *) old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -891,10 +1131,11 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float r = h_last / h; + float r = h_last / h; for (int j = 0; j < ggml_nelements(x); j++) { - float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; - vec_x[j] = a * vec_x[j] - b * denoised_d; + float denoised_d = + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -903,10 +1144,11 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } break; + } + break; case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -914,13 +1156,13 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; + float *vec_old_denoised = (float *) old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -930,14 +1172,15 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float h_min = std::min(h_last, h); - float h_max = std::max(h_last, h); - float r = h_max / h_min; - float h_d = (h_max + h_min) / 2.f; - float b = exp(-h_d) - 1.f; + float h_min = std::min(h_last, h); + float h_max = std::max(h_last, h); + float r = h_max / h_min; + float h_d = (h_max + h_min) / 2.f; + float b = exp(-h_d) - 1.f; for (int j = 0; j < ggml_nelements(x); j++) { - float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; - vec_x[j] = a * vec_x[j] - b * denoised_d; + float denoised_d = + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -946,11 +1189,12 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } break; + } + break; case LCM: // Latent Consistency Models { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { 
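                    // Each LCM step takes the consistency-model shortcut: x is replaced by
                    // the denoised prediction directly, and if another step follows it is
                    // re-noised up to the next level: x <- denoised + sigma_{i+1} * z, z ~ N(0, I).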
float sigma = sigmas[i]; @@ -960,8 +1204,8 @@ class StableDiffusionGGML { // x = denoised { - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_denoised[j]; } @@ -972,8 +1216,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j]; @@ -981,7 +1225,8 @@ class StableDiffusionGGML { } } } - } break; + } + break; default: LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); @@ -992,27 +1237,28 @@ class StableDiffusionGGML { } // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { + ggml_tensor *get_first_stage_encoding(ggml_context *work_ctx, ggml_tensor *moments) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); + ggml_tensor *latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], + moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, latent); ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "noise.bin"); { - float mean = 0; + float mean = 0; float logvar = 0; - float value = 0; - float std_ = 0; + float value = 0; + float std_ = 0; for (int i = 0; i < latent->ne[3]; i++) { for (int j = 0; j < latent->ne[2]; j++) { for (int k = 0; k < latent->ne[1]; k++) { for (int l = 0; l < latent->ne[0]; l++) { - mean = ggml_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); + mean = ggml_tensor_get_f32(moments, l, k, j, i); + logvar = ggml_tensor_get_f32(moments, l, k, j + (int) latent->ne[2], i); logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); - value = value * scale_factor; + std_ = std::exp(0.5f * logvar); + value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); + value = value * scale_factor; // printf("%d %d %d %d -> %f\n", i, j, k, l, value); ggml_tensor_set_f32(latent, value, l, k, j, i); } @@ -1023,14 +1269,14 @@ class StableDiffusionGGML { return latent; } - ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - ggml_tensor* result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, + ggml_tensor *compute_first_stage(ggml_context *work_ctx, ggml_tensor *x, bool decode) { + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + ggml_tensor *result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, decode ? (W * 8) : (W / 8), // width decode ? (H * 8) : (H / 8), // height decode ? 3 : (use_tiny_autoencoder ? 
4 : 8));  // channels
-        int64_t t0 = ggml_time_ms();
+        int64_t t0 = ggml_time_ms();
         if (!use_tiny_autoencoder) {
             if (decode) {
                 ggml_tensor_scale(x, 1.0f / scale_factor);
@@ -1039,7 +1285,7 @@ class StableDiffusionGGML {
             }
             if (vae_tiling && decode) {  // TODO: support tiling vae encode
                 // split latent in 32x32 tiles and compute in several steps
-                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) {
                     if (init) {
                         first_stage_model.alloc_compute_buffer(in, decode);
                     } else {
@@ -1058,7 +1304,7 @@ class StableDiffusionGGML {
         } else {
             if (vae_tiling && decode) {  // TODO: support tiling vae encode
                 // split latent in 64x64 tiles and compute in several steps
-                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) {
                     if (init) {
                         tae_first_stage.alloc_compute_buffer(in, decode);
                     } else {
@@ -1073,18 +1319,19 @@ class StableDiffusionGGML {
             tae_first_stage.free_compute_buffer();
         }
         int64_t t1 = ggml_time_ms();
-        LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000);
+        LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE",
+                  (t1 - t0) * 1.0f / 1000);
         if (decode) {
             ggml_tensor_clamp(result, 0.0f, 1.0f);
         }
         return result;
     }

-    ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
+    ggml_tensor *encode_first_stage(ggml_context *work_ctx, ggml_tensor *x) {
         return compute_first_stage(work_ctx, x, false);
     }

-    ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
+    ggml_tensor *decode_first_stage(ggml_context *work_ctx, ggml_tensor *x) {
         return compute_first_stage(work_ctx, x, true);
     }
 };
@@ -1092,53 +1339,38 @@ class StableDiffusionGGML {

 /*================================================= SD API ==================================================*/

 struct sd_ctx_t {
-    StableDiffusionGGML* sd = NULL;
+    StableDiffusionGGML *sd = NULL;
 };

-sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
-                     const char* vae_path_c_str,
-                     const char* taesd_path_c_str,
-                     const char* lora_model_dir_c_str,
+sd_ctx_t *new_sd_ctx(int n_threads,
                      bool vae_decode_only,
-                     bool vae_tiling,
                      bool free_params_immediately,
-                     int n_threads,
-                     enum sd_type_t wtype,
+                     const char *lora_model_dir_c_str,
                      enum rng_type_t rng_type,
-                     enum schedule_t s) {
-    sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
+                     bool vae_tiling,
+                     enum sd_type_t wtype,
+                     enum schedule_t s,
+                     bool init_backend_immediately) {
+    sd_ctx_t *sd_ctx = (sd_ctx_t *) malloc(sizeof(sd_ctx_t));
     if (sd_ctx == NULL) {
         return NULL;
     }
-    std::string model_path(model_path_c_str);
-    std::string vae_path(vae_path_c_str);
-    std::string taesd_path(taesd_path_c_str);
     std::string lora_model_dir(lora_model_dir_c_str);

     sd_ctx->sd = new StableDiffusionGGML(n_threads,
                                          vae_decode_only,
                                          free_params_immediately,
                                          lora_model_dir,
-                                         rng_type);
-    if (sd_ctx->sd == NULL) {
-        return NULL;
-    }
-
-    if (!sd_ctx->sd->load_from_file(model_path,
-                                    vae_path,
-                                    taesd_path,
-                                    vae_tiling,
-                                    (ggml_type)wtype,
-                                    s)) {
-        delete sd_ctx->sd;
-        sd_ctx->sd = NULL;
-        free(sd_ctx);
-        return NULL;
-    }
+                                         rng_type,
+                                         vae_tiling,
+                                         static_cast<ggml_type>(wtype),
+                                         s,
+                                         init_backend_immediately);
     return sd_ctx;
 }

-void free_sd_ctx(sd_ctx_t* sd_ctx) {
+void free_sd_ctx(sd_ctx_t *sd_ctx) {
     if (sd_ctx->sd != NULL) {
         delete sd_ctx->sd;
         sd_ctx->sd = NULL;
@@ -1146,9 +1378,125 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
     free(sd_ctx);
 }

-sd_image_t* txt2img(sd_ctx_t* sd_ctx,
-                    const char* prompt_c_str,
-                    const char* negative_prompt_c_str,
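// A minimal sketch (editor's illustration, not part of this patch) of the split-loading
// flow the C API below enables; the file paths are placeholders:
//
//     sd_ctx_t* ctx = new_sd_ctx(/*n_threads=*/4, /*vae_decode_only=*/true,
//                                /*free_params_immediately=*/false, /*lora_model_dir=*/"",
//                                CUDA_RNG, /*vae_tiling=*/false, SD_TYPE_COUNT, DEFAULT,
//                                /*init_backend_immediately=*/true);
//     load_clip_from_file(ctx, "clip.safetensors", "te.");
//     load_unet_from_file(ctx, "unet.safetensors", "unet.");
//     load_vae_from_file(ctx, "vae.safetensors", "vae.");
//     // ... txt2img(ctx, ...); ...
//     free_diffusions_params(ctx);
//     free_sd_ctx(ctx);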
-sd_image_t* txt2img(sd_ctx_t* sd_ctx, - const char* prompt_c_str, - const char* negative_prompt_c_str, +void init_backend(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->init_backend(); +} + +void set_options(sd_ctx_t *sd_ctx, + int n_threads, + bool vae_decode_only, + bool free_params_immediately, + const char *lora_model_dir, + rng_type_t rng_type, + bool vae_tiling, + sd_type_t wtype, + schedule_t schedule +) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->set_options( + n_threads, + vae_decode_only, + free_params_immediately, + std::string(lora_model_dir), + rng_type, + vae_tiling, + wtype, + schedule + ); +} + +bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_clip_from_file(std::string(model_path), true,std::string(prefix)); +} + +void free_clip_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->free_clip_params(); +} + +bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_unet_from_file(std::string(model_path), true, std::string(prefix)); +} + +void free_unet_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->free_unet_params(); +} + +bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_vae_from_file(std::string(model_path), true, std::string(prefix)); +} + +void free_vae_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->free_vae_params(); +} + +bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_taesd_from_file(std::string(model_path)); +} + +void free_taesd_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + sd_ctx->sd->free_taesd_params(); +} + +// load all model from one file +bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return false; + } + return sd_ctx->sd->load_diffusions_from_file(std::string(model_path)); +} + +// free all model from one file +void free_diffusions_params(sd_ctx_t *sd_ctx) { + if (sd_ctx == NULL || sd_ctx->sd == NULL) { + LOG_ERROR("must call new_sd_ctx first"); + return; + } + return sd_ctx->sd->free_diffusions_params(); +} + +sd_image_t *txt2img(sd_ctx_t *sd_ctx, + const char *prompt_c_str, + const char *negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1166,10 +1514,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, std::string negative_prompt(negative_prompt_c_str); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto 
result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto& kv : lora_f2m) { + for (auto &kv: lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } @@ -1185,10 +1533,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, params.mem_size += width * height * 3 * sizeof(float); params.mem_size *= batch_count; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context* work_ctx = ggml_init(params); + struct ggml_context *work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; @@ -1198,24 +1546,25 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library // by a third party with a seed <0, let's incorporate randomization here. - srand((int)time(NULL)); + srand((int) time(NULL)); seed = rand(); } - t0 = ggml_time_ms(); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor* c = cond_pair.first; - ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor* uc = NULL; - struct ggml_tensor* uc_vector = NULL; + t0 = ggml_time_ms(); + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor *c = cond_pair.first; + ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor *uc = NULL; + struct ggml_tensor *uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { force_zero_embeddings = true; } - auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, + force_zero_embeddings); + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); @@ -1224,23 +1573,24 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, sd_ctx->sd->cond_stage_model.free_params_buffer(); } - std::vector final_latents; // collect latents to decode + std::vector final_latents; // collect latents to decode int C = 4; int W = width / 8; int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); - int64_t cur_seed = seed + b; + int64_t cur_seed = seed + b; LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigmas); + struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, + sample_method, sigmas); // struct ggml_tensor* x_0 = 
load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
         int64_t sampling_end = ggml_time_ms();
@@ -1252,13 +1602,14 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
         sd_ctx->sd->diffusion_model.free_params_buffer();
     }
     int64_t t3 = ggml_time_ms();
-    LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000);
+    LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(),
+             (t3 - t1) * 1.0f / 1000);
 
     LOG_INFO("decoding %zu latents", final_latents.size());
-    std::vector<struct ggml_tensor*> decoded_images;  // collect decoded images
+    std::vector<struct ggml_tensor*> decoded_images; // collect decoded images
     for (size_t i = 0; i < final_latents.size(); i++) {
-        t1                      = ggml_time_ms();
-        struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
+        t1 = ggml_time_ms();
+        struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
         // print_ggml_tensor(img);
         if (img != NULL) {
             decoded_images.push_back(img);
@@ -1272,30 +1623,30 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
     if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) {
         sd_ctx->sd->first_stage_model.free_params_buffer();
     }
-    sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t));
+    sd_image_t *result_images = (sd_image_t *) calloc(batch_count, sizeof(sd_image_t));
     if (result_images == NULL) {
         ggml_free(work_ctx);
         return NULL;
     }
 
     for (size_t i = 0; i < decoded_images.size(); i++) {
-        result_images[i].width   = width;
-        result_images[i].height  = height;
+        result_images[i].width = width;
+        result_images[i].height = height;
         result_images[i].channel = 3;
-        result_images[i].data    = sd_tensor_to_image(decoded_images[i]);
+        result_images[i].data = sd_tensor_to_image(decoded_images[i]);
     }
     ggml_free(work_ctx);
 
     LOG_INFO(
-        "txt2img completed in %.2fs",
-        (t4 - t0) * 1.0f / 1000);
+            "txt2img completed in %.2fs",
+            (t4 - t0) * 1.0f / 1000);
 
     return result_images;
 }
 
-sd_image_t* img2img(sd_ctx_t* sd_ctx,
+sd_image_t *img2img(sd_ctx_t *sd_ctx,
                     sd_image_t init_image,
-                    const char* prompt_c_str,
-                    const char* negative_prompt_c_str,
+                    const char *prompt_c_str,
+                    const char *negative_prompt_c_str,
                     int clip_skip,
                     float cfg_scale,
                     int width,
@@ -1314,7 +1665,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     LOG_INFO("img2img %dx%d", width, height);
 
     std::vector<float> sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps);
-    size_t t_enc              = static_cast<size_t>(sample_steps * strength);
+    size_t t_enc = static_cast<size_t>(sample_steps * strength);
     LOG_INFO("target t_enc is %zu steps", t_enc);
     std::vector<float> sigma_sched;
     sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
 
@@ -1323,26 +1674,26 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10 MB
     params.mem_size += width * height * 3 * sizeof(float) * 2;
     params.mem_buffer = NULL;
-    params.no_alloc   = false;
+    params.no_alloc = false;
     // LOG_DEBUG("mem_size %u ", params.mem_size);
 
     // draft context
-    struct ggml_context* work_ctx = ggml_init(params);
+    struct ggml_context *work_ctx = ggml_init(params);
     if (!work_ctx) {
         LOG_ERROR("ggml_init() failed");
        return NULL;
     }
 
     if (seed < 0) {
-        seed = (int)time(NULL);
+        seed = (int) time(NULL);
     }
     sd_ctx->sd->rng->manual_seed(seed);
 
     // extract and remove lora
-    auto result_pair                                = extract_and_remove_lora(prompt);
+    auto result_pair = extract_and_remove_lora(prompt);
     std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
 
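+    // prompt-embedded LoRA directives are handled exactly as in txt2img above
-    for (auto& kv : 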
lora_f2m) { + for (auto &kv: lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } prompt = result_pair.second; @@ -1354,13 +1705,13 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, int64_t t1 = ggml_time_ms(); LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + ggml_tensor *init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(init_image.data, init_img); - t0 = ggml_time_ms(); - ggml_tensor* init_latent = NULL; + t0 = ggml_time_ms(); + ggml_tensor *init_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + ggml_tensor *moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); } else { init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } @@ -1368,19 +1719,20 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor* c = cond_pair.first; - ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor* uc = NULL; - struct ggml_tensor* uc_vector = NULL; + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor *c = cond_pair.first; + ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor *uc = NULL; + struct ggml_tensor *uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { force_zero_embeddings = true; } - auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, + force_zero_embeddings); + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } int64_t t2 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1); @@ -1389,11 +1741,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } sd_ctx->sd->rng->manual_seed(seed); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, init_latent); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, + struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigma_sched); // struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1403,7 +1755,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model.free_params_buffer(); } - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); + struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { 
sd_ctx->sd->first_stage_model.free_params_buffer();
     }
 
@@ -1412,17 +1764,17 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         return NULL;
     }
 
-    sd_image_t* result_images = (sd_image_t*)calloc(1, sizeof(sd_image_t));
+    sd_image_t *result_images = (sd_image_t *) calloc(1, sizeof(sd_image_t));
     if (result_images == NULL) {
         ggml_free(work_ctx);
         return NULL;
     }
 
     for (size_t i = 0; i < 1; i++) {
-        result_images[i].width   = width;
-        result_images[i].height  = height;
+        result_images[i].width = width;
+        result_images[i].height = height;
         result_images[i].channel = 3;
-        result_images[i].data    = sd_tensor_to_image(img);
+        result_images[i].data = sd_tensor_to_image(img);
     }
     ggml_free(work_ctx);
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index a18ee4a3..0d59dce4 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -54,8 +54,8 @@ enum schedule_t {
 
 // same as enum ggml_type
 enum sd_type_t {
-    SD_TYPE_F32  = 0,
-    SD_TYPE_F16  = 1,
+    SD_TYPE_F32 = 0,
+    SD_TYPE_F16 = 1,
     SD_TYPE_Q4_0 = 2,
     SD_TYPE_Q4_1 = 3,
     // SD_TYPE_Q4_2 = 4, support has been removed
@@ -78,7 +78,7 @@ enum sd_type_t {
     SD_TYPE_COUNT,
 };
 
-SD_API const char* sd_type_name(enum sd_type_t type);
+SD_API const char *sd_type_name(enum sd_type_t type);
 
 enum sd_log_level_t {
     SD_LOG_DEBUG,
@@ -87,38 +87,36 @@ enum sd_log_level_t {
     SD_LOG_ERROR
 };
 
-typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
+typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char *text, void *data);
 
-SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
+SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void *data);
 SD_API int32_t get_num_physical_cores();
-SD_API const char* sd_get_system_info();
+SD_API const char *sd_get_system_info();
 
 typedef struct {
     uint32_t width;
     uint32_t height;
     uint32_t channel;
-    uint8_t* data;
+    uint8_t *data;
 } sd_image_t;
 
 typedef struct sd_ctx_t sd_ctx_t;
 
-SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
-                            const char* vae_path,
-                            const char* taesd_path,
-                            const char* lora_model_dir,
+SD_API sd_ctx_t *new_sd_ctx(int n_threads,
                             bool vae_decode_only,
-                            bool vae_tiling,
                             bool free_params_immediately,
-                            int n_threads,
-                            enum sd_type_t wtype,
+                            const char *lora_model_dir_c_str,
                             enum rng_type_t rng_type,
-                            enum schedule_t s);
+                            bool vae_tiling,
+                            enum sd_type_t wtype,
+                            enum schedule_t s,
+                            bool init_backend_immediately = true);
 
-SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
+SD_API void free_sd_ctx(sd_ctx_t *sd_ctx);
 
-SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
-                           const char* prompt,
-                           const char* negative_prompt,
+SD_API sd_image_t *txt2img(sd_ctx_t *sd_ctx,
+                           const char *prompt,
+                           const char *negative_prompt,
                            int clip_skip,
                            float cfg_scale,
                            int width,
@@ -128,10 +126,10 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                            int64_t seed,
                            int batch_count);
 
-SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
+SD_API sd_image_t *img2img(sd_ctx_t *sd_ctx,
                            sd_image_t init_image,
-                           const char* prompt,
-                           const char* negative_prompt,
+                           const char *prompt,
+                           const char *negative_prompt,
                            int clip_skip,
                            float cfg_scale,
                            int width,
@@ -144,14 +142,46 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
 
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 
-SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
+SD_API upscaler_ctx_t *new_upscaler_ctx(const char *esrgan_path,
                                         int n_threads,
                                         enum sd_type_t wtype);
-SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
+SD_API void free_upscaler_ctx(upscaler_ctx_t *upscaler_ctx);
+
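+// The declarations below expose the staged loading workflow added by this
+// patch. A sketch of a custom loading sequence (C++ only: the default
+// arguments are invisible to plain C callers; file names are hypothetical):
+//
+//   sd_ctx_t *ctx = new_sd_ctx(n_threads, false, false, "", CUDA_RNG,
+//                              false, SD_TYPE_F16, DEFAULT,
+//                              false /* init_backend_immediately */);
+//   init_backend(ctx);
+//   load_clip_from_file(ctx, "clip.safetensors");  // default prefix "te."
+//   load_unet_from_file(ctx, "unet.safetensors");  // default prefix "unet."
+//   load_vae_from_file(ctx, "vae.safetensors");    // default prefix "vae."
+//   sd_image_t *image = txt2img(ctx, /* ... */);
+//   free_diffusions_params(ctx);
+//   free_sd_ctx(ctx);
+
+SD_API sd_image_t upscale(upscaler_ctx_t *upscaler_ctx, sd_image_t input_image, 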
uint32_t upscale_factor); + +SD_API void init_backend(sd_ctx_t *sd_ctx); + +SD_API void set_options(sd_ctx_t *sd_ctx, + int n_threads, + bool vae_decode_only, + bool free_params_immediately, + const char *lora_model_dir, + rng_type_t rng_type, + bool vae_tiling, + sd_type_t wtype, + schedule_t schedule); + +SD_API bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "te."); + +SD_API void free_clip_params(sd_ctx_t *sd_ctx); + +SD_API bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "unet."); + +SD_API void free_unet_params(sd_ctx_t *sd_ctx); + +SD_API bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "vae."); + +SD_API void free_vae_params(sd_ctx_t *sd_ctx); + +SD_API bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path); + +SD_API void free_taesd_params(sd_ctx_t *sd_ctx); + +SD_API bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path); -SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); +SD_API void free_diffusions_params(sd_ctx_t *sd_ctx); -SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type); +SD_API bool convert(const char *input_path, const char *vae_path, const char *output_path, sd_type_t output_type); #ifdef __cplusplus } From c7d11b9ba031f6dcd69ea49a28f77d7529e4336e Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Tue, 23 Jan 2024 19:08:49 +0800 Subject: [PATCH 2/8] format code --- clip.hpp | 11 +- esrgan.hpp | 2 +- examples/cli/main.cpp | 133 ++++---- model.cpp | 484 ++++++++++++++-------------- model.h | 3 +- stable-diffusion.cpp | 734 +++++++++++++++++++++--------------------- stable-diffusion.h | 78 ++--- unet.hpp | 18 +- 8 files changed, 720 insertions(+), 743 deletions(-) diff --git a/clip.hpp b/clip.hpp index a456fffc..742cce09 100644 --- a/clip.hpp +++ b/clip.hpp @@ -475,7 +475,6 @@ struct ResidualAttentionBlock { ln2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); ln2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size); - } void map_by_name(std::map& tensors, const std::string prefix) { @@ -822,11 +821,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule { auto hidden_states2 = text_model2.forward(ctx0, input_ids2); // [N, n_token, hidden_size2] hidden_states2 = ggml_reshape_4d(ctx0, - hidden_states2, - hidden_states2->ne[0], - hidden_states2->ne[1], - hidden_states2->ne[2], - hidden_states2->ne[3]); + hidden_states2, + hidden_states2->ne[0], + hidden_states2->ne[1], + hidden_states2->ne[2], + hidden_states2->ne[3]); hidden_states2 = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states2, 2, 0, 1, 3)); hidden_states = ggml_concat(ctx0, hidden_states, hidden_states2); // [N, n_token, hidden_size + hidden_size2] diff --git a/esrgan.hpp b/esrgan.hpp index 90194c0d..c86363f7 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -376,7 +376,7 @@ struct ESRGAN : public GGMLModule { struct ggml_cgraph* gf = ggml_new_graph(ctx0); struct ggml_tensor* x_ = NULL; - float out_scale = 0.2f; + float out_scale = 0.2f; // it's performing a compute, check if backend isn't cpu if (!ggml_backend_is_cpu(backend)) { diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index bde19f34..b08340b3 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -17,34 +17,34 @@ #include "stb_image_write.h" -const char *rng_type_to_str[] = { - "std_default", - "cuda", +const char* rng_type_to_str[] = { + 
"std_default", + "cuda", }; // Names of the sampler method, same order as enum sample_method in stable-diffusion.h -const char *sample_method_str[] = { - "euler_a", - "euler", - "heun", - "dpm2", - "dpm++2s_a", - "dpm++2m", - "dpm++2mv2", - "lcm", +const char* sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h -const char *schedule_str[] = { - "default", - "discrete", - "karras", +const char* schedule_str[] = { + "default", + "discrete", + "karras", }; -const char *modes_str[] = { - "txt2img", - "img2img", - "convert", +const char* modes_str[] = { + "txt2img", + "img2img", + "convert", }; enum SDMode { @@ -56,7 +56,7 @@ enum SDMode { struct SDParams { int n_threads = -1; - SDMode mode = TXT2IMG; + SDMode mode = TXT2IMG; std::string model_path; std::string vae_path; @@ -70,22 +70,22 @@ struct SDParams { std::string prompt; std::string negative_prompt; float cfg_scale = 7.0f; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; int batch_count = 1; sample_method_t sample_method = EULER_A; - schedule_t schedule = DEFAULT; - int sample_steps = 20; - float strength = 0.75f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool vae_tiling = false; + schedule_t schedule = DEFAULT; + int sample_steps = 20; + float strength = 0.75f; + rng_type_t rng_type = CUDA_RNG; + int64_t seed = 42; + bool verbose = false; + bool vae_tiling = false; }; -static std::string sd_basename(const std::string &path) { +static std::string sd_basename(const std::string& path) { size_t pos = path.find_last_of('/'); if (pos != std::string::npos) { return path.substr(pos + 1); @@ -124,7 +124,7 @@ void print_params(SDParams params) { printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false"); } -void print_usage(int argc, const char *argv[]) { +void print_usage(int argc, const char* argv[]) { printf("usage: %s [arguments]\n", argv[0]); printf("\n"); printf("arguments:\n"); @@ -161,7 +161,7 @@ void print_usage(int argc, const char *argv[]) { printf(" -v, --verbose print extra info\n"); } -void parse_args(int argc, const char **argv, SDParams ¶ms) { +void parse_args(int argc, const char** argv, SDParams& params) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -178,8 +178,8 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - const char *mode_selected = argv[i]; - int mode_found = -1; + const char* mode_selected = argv[i]; + int mode_found = -1; for (int d = 0; d < MODE_COUNT; d++) { if (!strcmp(mode_selected, modes_str[d])) { mode_found = d; @@ -190,7 +190,7 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { mode_selected); exit(1); } - params.mode = (SDMode) mode_found; + params.mode = (SDMode)mode_found; } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_arg = true; @@ -334,8 +334,8 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - const char *schedule_selected = argv[i]; - int schedule_found = -1; + const char* schedule_selected = argv[i]; + int schedule_found = -1; for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; @@ -345,7 +345,7 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - params.schedule = (schedule_t) schedule_found; + params.schedule = (schedule_t)schedule_found; } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_arg = true; @@ -357,8 +357,8 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - const char *sample_method_selected = argv[i]; - int sample_method_found = -1; + const char* sample_method_selected = argv[i]; + int sample_method_found = -1; for (int m = 0; m < N_SAMPLE_METHODS; m++) { if (!strcmp(sample_method_selected, sample_method_str[m])) { sample_method_found = m; @@ -368,7 +368,7 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { invalid_arg = true; break; } - params.sample_method = (sample_method_t) sample_method_found; + params.sample_method = (sample_method_t)sample_method_found; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv); exit(0); @@ -434,7 +434,7 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { } if (params.seed < 0) { - srand((int) time(NULL)); + srand((int)time(NULL)); params.seed = rand(); } @@ -465,8 +465,8 @@ std::string get_image_params(SDParams params, int64_t seed) { return parameter_string; } -void sd_log_cb(enum sd_log_level_t level, const char *log, void *data) { - SDParams *params = (SDParams *) data; +void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { + SDParams* params = (SDParams*)data; if (!params->verbose && level <= SD_LOG_DEBUG) { return; } @@ -479,11 +479,11 @@ void sd_log_cb(enum sd_log_level_t level, const char *log, void *data) { } } -int main(int argc, const char *argv[]) { +int main(int argc, const char* argv[]) { SDParams params; parse_args(argc, argv, params); - sd_set_log_callback(sd_log_cb, (void *) ¶ms); + sd_set_log_callback(sd_log_cb, (void*)¶ms); if (params.verbose) { print_params(params); @@ -511,12 +511,12 @@ int main(int argc, const char *argv[]) { } } - bool vae_decode_only = true; - uint8_t 
*input_image_buffer = NULL; + bool vae_decode_only = true; + uint8_t* input_image_buffer = NULL; if (params.mode == IMG2IMG) { vae_decode_only = false; - int c = 0; + int c = 0; input_image_buffer = stbi_load(params.input_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); if (input_image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); @@ -539,17 +539,16 @@ int main(int argc, const char *argv[]) { } } - sd_ctx_t *sd_ctx = new_sd_ctx( - params.n_threads, - vae_decode_only, - true, - params.lora_model_dir.c_str(), - params.rng_type, - params.vae_tiling, - params.wtype, - params.schedule, - true - ); + sd_ctx_t* sd_ctx = new_sd_ctx( + params.n_threads, + vae_decode_only, + true, + params.lora_model_dir.c_str(), + params.rng_type, + params.vae_tiling, + params.wtype, + params.schedule, + true); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); @@ -577,7 +576,7 @@ int main(int argc, const char *argv[]) { } } - sd_image_t *results; + sd_image_t* results; if (params.mode == TXT2IMG) { results = txt2img(sd_ctx, params.prompt.c_str(), @@ -591,8 +590,8 @@ int main(int argc, const char *argv[]) { params.seed, params.batch_count); } else { - sd_image_t input_image = {(uint32_t) params.width, - (uint32_t) params.height, + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, 3, input_image_buffer}; @@ -619,7 +618,7 @@ int main(int argc, const char *argv[]) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0) { - upscaler_ctx_t *upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.n_threads, params.wtype); @@ -641,7 +640,7 @@ int main(int argc, const char *argv[]) { } } - size_t last = params.output_path.find_last_of("."); + size_t last = params.output_path.find_last_of("."); std::string dummy_name = last != std::string::npos ? 
params.output_path.substr(0, last) : params.output_path; for (int i = 0; i < params.batch_count; i++) { if (results[i].data == NULL) { diff --git a/model.cpp b/model.cpp index 60f4dda7..f4130e5c 100644 --- a/model.cpp +++ b/model.cpp @@ -23,7 +23,7 @@ #define ST_HEADER_SIZE_LEN 8 -uint64_t read_u64(uint8_t *buffer) { +uint64_t read_u64(uint8_t* buffer) { // little endian uint64_t value = 0; value |= static_cast(buffer[7]) << 56; @@ -37,7 +37,7 @@ uint64_t read_u64(uint8_t *buffer) { return value; } -int32_t read_int(uint8_t *buffer) { +int32_t read_int(uint8_t* buffer) { // little endian int value = 0; value |= buffer[3] << 24; @@ -47,7 +47,7 @@ int32_t read_int(uint8_t *buffer) { return value; } -uint16_t read_short(uint8_t *buffer) { +uint16_t read_short(uint8_t* buffer) { // little endian uint16_t value = 0; value |= buffer[1] << 8; @@ -58,44 +58,44 @@ uint16_t read_short(uint8_t *buffer) { /*================================================= Preprocess ==================================================*/ std::string self_attn_names[] = { - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + "self_attn.q_proj.bias", + "self_attn.k_proj.bias", + "self_attn.v_proj.bias", }; -const char *unused_tensors[] = { - "betas", - "alphas_cumprod_prev", - "sqrt_alphas_cumprod", - "sqrt_one_minus_alphas_cumprod", - "log_one_minus_alphas_cumprod", - "sqrt_recip_alphas_cumprod", - "sqrt_recipm1_alphas_cumprod", - "posterior_variance", - "posterior_log_variance_clipped", - "posterior_mean_coef1", - "posterior_mean_coef2", - "cond_stage_model.transformer.text_model.embeddings.position_ids", - "cond_stage_model.model.logit_scale", - "cond_stage_model.model.text_projection", - "conditioner.embedders.0.transformer.text_model.embeddings.position_ids", - "conditioner.embedders.0.model.logit_scale", - "conditioner.embedders.1.model.logit_scale", - "model.diffusion_model.time_embedding.cond_proj.weight", - "unet.time_embedding.cond_proj.weight", - "model_ema.decay", - "model_ema.num_updates", - "model_ema.diffusion_model", - "control_model", - "embedding_manager", - "denoiser.sigmas", +const char* unused_tensors[] = { + "betas", + "alphas_cumprod_prev", + "sqrt_alphas_cumprod", + "sqrt_one_minus_alphas_cumprod", + "log_one_minus_alphas_cumprod", + "sqrt_recip_alphas_cumprod", + "sqrt_recipm1_alphas_cumprod", + "posterior_variance", + "posterior_log_variance_clipped", + "posterior_mean_coef1", + "posterior_mean_coef2", + "cond_stage_model.transformer.text_model.embeddings.position_ids", + "cond_stage_model.model.logit_scale", + "cond_stage_model.model.text_projection", + "conditioner.embedders.0.transformer.text_model.embeddings.position_ids", + "conditioner.embedders.0.model.logit_scale", + "conditioner.embedders.1.model.logit_scale", + "model.diffusion_model.time_embedding.cond_proj.weight", + "unet.time_embedding.cond_proj.weight", + "model_ema.decay", + "model_ema.num_updates", + "model_ema.diffusion_model", + "control_model", + "embedding_manager", + "denoiser.sigmas", }; bool is_unused_tensor(std::string name) { - for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char *); i++) { + for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char*); i++) { if (starts_with(name, unused_tensors[i])) { return true; } @@ -104,54 +104,54 @@ bool is_unused_tensor(std::string name) { } std::unordered_map 
open_clip_to_hf_clip_model = { - {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"}, - {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"}, - {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"}, - {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"}, - {"model.text_projection", "transformer.text_model.text_projection"}, + {"model.ln_final.bias", "transformer.text_model.final_layer_norm.bias"}, + {"model.ln_final.weight", "transformer.text_model.final_layer_norm.weight"}, + {"model.positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"}, + {"model.token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"}, + {"model.text_projection", "transformer.text_model.text_projection"}, }; std::unordered_map open_clip_to_hk_clip_resblock = { - {"attn.out_proj.bias", "self_attn.out_proj.bias"}, - {"attn.out_proj.weight", "self_attn.out_proj.weight"}, - {"ln_1.bias", "layer_norm1.bias"}, - {"ln_1.weight", "layer_norm1.weight"}, - {"ln_2.bias", "layer_norm2.bias"}, - {"ln_2.weight", "layer_norm2.weight"}, - {"mlp.c_fc.bias", "mlp.fc1.bias"}, - {"mlp.c_fc.weight", "mlp.fc1.weight"}, - {"mlp.c_proj.bias", "mlp.fc2.bias"}, - {"mlp.c_proj.weight", "mlp.fc2.weight"}, + {"attn.out_proj.bias", "self_attn.out_proj.bias"}, + {"attn.out_proj.weight", "self_attn.out_proj.weight"}, + {"ln_1.bias", "layer_norm1.bias"}, + {"ln_1.weight", "layer_norm1.weight"}, + {"ln_2.bias", "layer_norm2.bias"}, + {"ln_2.weight", "layer_norm2.weight"}, + {"mlp.c_fc.bias", "mlp.fc1.bias"}, + {"mlp.c_fc.weight", "mlp.fc1.weight"}, + {"mlp.c_proj.bias", "mlp.fc2.bias"}, + {"mlp.c_proj.weight", "mlp.fc2.weight"}, }; std::unordered_map vae_decoder_name_map = { - {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"}, - {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"}, - {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"}, + {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"}, + {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"}, }; -std::string 
convert_open_clip_to_hf_clip(const std::string &name) { +std::string convert_open_clip_to_hf_clip(const std::string& name) { std::string new_name = name; std::string prefix; if (starts_with(new_name, "conditioner.embedders.0.")) { - prefix = "cond_stage_model."; + prefix = "cond_stage_model."; new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { - prefix = "cond_stage_model.1."; + prefix = "cond_stage_model.1."; new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "cond_stage_model.")) { - prefix = "cond_stage_model."; + prefix = "cond_stage_model."; new_name = new_name.substr(strlen("cond_stage_model.")); } else { return new_name; } std::string open_clip_resblock_prefix = "model.transformer.resblocks."; - std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; + std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) { new_name = open_clip_to_hf_clip_model[new_name]; @@ -159,21 +159,21 @@ std::string convert_open_clip_to_hf_clip(const std::string &name) { if (new_name.find(open_clip_resblock_prefix) == 0) { std::string remain = new_name.substr(open_clip_resblock_prefix.length()); - std::string idx = remain.substr(0, remain.find(".")); + std::string idx = remain.substr(0, remain.find(".")); std::string suffix = remain.substr(idx.length() + 1); if (suffix == "attn.in_proj_weight" || suffix == "attn.in_proj_bias") { new_name = hf_clip_resblock_prefix + idx + "." + suffix; } else if (open_clip_to_hk_clip_resblock.find(suffix) != open_clip_to_hk_clip_resblock.end()) { std::string new_suffix = open_clip_to_hk_clip_resblock[suffix]; - new_name = hf_clip_resblock_prefix + idx + "." + new_suffix; + new_name = hf_clip_resblock_prefix + idx + "." 
+ new_suffix; } } return prefix + new_name; } -std::string convert_vae_decoder_name(const std::string &name) { +std::string convert_vae_decoder_name(const std::string& name) { if (vae_decoder_name_map.find(name) != vae_decoder_name_map.end()) { return vae_decoder_name_map[name]; } @@ -181,57 +181,57 @@ std::string convert_vae_decoder_name(const std::string &name) { } std::unordered_map> suffix_conversion_underline = { + { + "attentions", { - "attentions", - { - {"to_k", "k"}, - {"to_q", "q"}, - {"to_v", "v"}, - {"to_out_0", "proj_out"}, - {"group_norm", "norm"}, - }, + {"to_k", "k"}, + {"to_q", "q"}, + {"to_v", "v"}, + {"to_out_0", "proj_out"}, + {"group_norm", "norm"}, }, + }, + { + "resnets", { - "resnets", - { - {"conv1", "in_layers_2"}, - {"conv2", "out_layers_3"}, - {"norm1", "in_layers_0"}, - {"norm2", "out_layers_0"}, - {"time_emb_proj", "emb_layers_1"}, - {"conv_shortcut", "skip_connection"}, - }, + {"conv1", "in_layers_2"}, + {"conv2", "out_layers_3"}, + {"norm1", "in_layers_0"}, + {"norm2", "out_layers_0"}, + {"time_emb_proj", "emb_layers_1"}, + {"conv_shortcut", "skip_connection"}, }, + }, }; std::unordered_map> suffix_conversion_dot = { + { + "attentions", { - "attentions", - { - {"to_k", "k"}, - {"to_q", "q"}, - {"to_v", "v"}, - {"to_out.0", "proj_out"}, - {"group_norm", "norm"}, - }, + {"to_k", "k"}, + {"to_q", "q"}, + {"to_v", "v"}, + {"to_out.0", "proj_out"}, + {"group_norm", "norm"}, }, + }, + { + "resnets", { - "resnets", - { - {"conv1", "in_layers.2"}, - {"conv2", "out_layers.3"}, - {"norm1", "in_layers.0"}, - {"norm2", "out_layers.0"}, - {"time_emb_proj", "emb_layers.1"}, - {"conv_shortcut", "skip_connection"}, - }, + {"conv1", "in_layers.2"}, + {"conv2", "out_layers.3"}, + {"norm1", "in_layers.0"}, + {"norm2", "out_layers.0"}, + {"time_emb_proj", "emb_layers.1"}, + {"conv_shortcut", "skip_connection"}, }, + }, }; -std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) { +std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) { std::vector m; - auto match = [](std::vector &match_list, const std::regex ®ex, const std::string &key) { + auto match = [](std::vector& match_list, const std::regex& regex, const std::string& key) { auto r = std::smatch{}; if (!std::regex_match(key, r, regex)) { return false; @@ -251,7 +251,7 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) suffix_conversion = suffix_conversion_dot; } - auto get_converted_suffix = [&suffix_conversion](const std::string &outer_key, const std::string &inner_key) { + auto get_converted_suffix = [&suffix_conversion](const std::string& outer_key, const std::string& inner_key) { auto outer_iter = suffix_conversion.find(outer_key); if (outer_iter != suffix_conversion.end()) { auto inner_iter = outer_iter->second.find(inner_key); @@ -280,8 +280,7 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) m[1]; } - if (match(m, std::regex( - format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[1], m[3]); // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str()); return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + @@ -296,8 +295,7 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) seq + suffix; } - if (match(m, 
std::regex( - format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { + if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[1], m[3]); return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + @@ -338,10 +336,10 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) std::string block_name; if (m[1] == "attentions") { block_name = "attn"; - suffix = get_converted_suffix(m[1], m[3]); + suffix = get_converted_suffix(m[1], m[3]); } else { block_name = "block"; - suffix = m[3]; + suffix = m[3]; } return format("first_stage_model%c%s%cmid%c%s_%d%c%s", seq, m[0].c_str(), seq, seq, block_name.c_str(), std::stoi(m[2]) + 1, seq, suffix.c_str()); @@ -389,7 +387,7 @@ std::string convert_diffusers_name_to_compvis(const std::string &key, char seq) return key; } -std::string convert_tensor_name(const std::string &name) { +std::string convert_tensor_name(const std::string& name) { std::string new_name; if (starts_with(name, "cond_stage_model.") || starts_with(name, "conditioner.embedders.")) { new_name = convert_open_clip_to_hf_clip(name); @@ -399,7 +397,7 @@ std::string convert_tensor_name(const std::string &name) { size_t pos = name.find('.'); if (pos != std::string::npos) { std::string name_without_network_parts = name.substr(5, pos - 5); - std::string network_part = name.substr(pos + 1); + std::string network_part = name.substr(pos + 1); // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str()); std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '_'); if (new_key.empty()) { @@ -414,7 +412,7 @@ std::string convert_tensor_name(const std::string &name) { size_t pos = name.find_last_of('.'); if (pos != std::string::npos) { std::string name_without_network_parts = name.substr(0, pos); - std::string network_part = name.substr(pos + 1); + std::string network_part = name.substr(pos + 1); // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str()); std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.'); if (new_key.empty()) { @@ -435,7 +433,7 @@ std::string convert_tensor_name(const std::string &name) { } void preprocess_tensor(TensorStorage tensor_storage, - std::vector &processed_tensor_storages) { + std::vector& processed_tensor_storages) { std::vector result; std::string new_name = convert_tensor_name(tensor_storage.name); @@ -458,9 +456,9 @@ void preprocess_tensor(TensorStorage tensor_storage, std::string prefix = new_name.substr(0, prefix_size); std::vector chunks = tensor_storage.chunk(3); - chunks[0].name = prefix + "self_attn.q_proj.weight"; - chunks[1].name = prefix + "self_attn.k_proj.weight"; - chunks[2].name = prefix + "self_attn.v_proj.weight"; + chunks[0].name = prefix + "self_attn.q_proj.weight"; + chunks[1].name = prefix + "self_attn.k_proj.weight"; + chunks[2].name = prefix + "self_attn.v_proj.weight"; processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end()); @@ -470,9 +468,9 @@ void preprocess_tensor(TensorStorage tensor_storage, std::string prefix = new_name.substr(0, prefix_size); std::vector chunks = tensor_storage.chunk(3); - chunks[0].name = prefix + "self_attn.q_proj.bias"; - chunks[1].name = prefix + "self_attn.k_proj.bias"; - chunks[2].name = prefix + 
"self_attn.v_proj.bias"; + chunks[0].name = prefix + "self_attn.q_proj.bias"; + chunks[1].name = prefix + "self_attn.k_proj.bias"; + chunks[2].name = prefix + "self_attn.v_proj.bias"; processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end()); } else { @@ -482,38 +480,38 @@ void preprocess_tensor(TensorStorage tensor_storage, float bf16_to_f32(uint16_t bfloat16) { uint32_t val_bits = (static_cast(bfloat16) << 16); - return *reinterpret_cast(&val_bits); + return *reinterpret_cast(&val_bits); } -void bf16_to_f32_vec(uint16_t *src, float *dst, int64_t n) { +void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) { // support inplace op for (int64_t i = n - 1; i >= 0; i--) { dst[i] = bf16_to_f32(src[i]); } } -void convert_tensor(void *src, ggml_type src_type, void *dst, ggml_type dst_type, int n) { +void convert_tensor(void* src, ggml_type src_type, void* dst, ggml_type dst_type, int n) { if (src_type == dst_type) { size_t nbytes = n * ggml_type_size(src_type) / ggml_blck_size(src_type); - memcpy(((char *) dst), ((char *) src), nbytes); + memcpy(((char*)dst), ((char*)src), nbytes); } else if (src_type == GGML_TYPE_F32) { if (dst_type == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row((float *) src, (ggml_fp16_t *) dst, n); + ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n); } else { int64_t hist[16]; - ggml_quantize_chunk(dst_type, (float *) src, dst, 0, n, hist); + ggml_quantize_chunk(dst_type, (float*)src, dst, 0, n, hist); } } else if (dst_type == GGML_TYPE_F32) { if (src_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((ggml_fp16_t *) src, (float *) dst, n); + ggml_fp16_to_fp32_row((ggml_fp16_t*)src, (float*)dst, n); } else { auto qtype = ggml_internal_get_type_traits(src_type); if (qtype.to_float == NULL) { throw std::runtime_error( - format("type %s unsupported for integer quantization: no dequantization available", - ggml_type_name(src_type))); + format("type %s unsupported for integer quantization: no dequantization available", + ggml_type_name(src_type))); } - qtype.to_float(src, (float *) dst, n); + qtype.to_float(src, (float*)dst, n); } } else { // src_type == GGML_TYPE_F16 => dst_type is quantized @@ -525,13 +523,13 @@ void convert_tensor(void *src, ggml_type src_type, void *dst, ggml_type dst_type } std::vector buf; buf.resize(sizeof(float) * n); - char *src_data_f32 = buf.data(); - qtype.to_float(src, (float *) src_data_f32, n); + char* src_data_f32 = buf.data(); + qtype.to_float(src, (float*)src_data_f32, n); if (dst_type == GGML_TYPE_F16) { - ggml_fp32_to_fp16_row((float *) src_data_f32, (ggml_fp16_t *) dst, n); + ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n); } else { int64_t hist[16]; - ggml_quantize_chunk(dst_type, (float *) src_data_f32, dst, 0, n, hist); + ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, n, hist); } } } @@ -569,7 +567,7 @@ std::map unicode_to_byte() { // byte_decoder = {v: k for k, v in byte_encoder.items()} std::map byte_decoder; - for (const auto &entry: byte_to_unicode) { + for (const auto& entry : byte_to_unicode) { byte_decoder[entry.second] = entry.first; } @@ -578,8 +576,8 @@ std::map unicode_to_byte() { return byte_decoder; } -bool is_zip_file(const std::string &file_path) { - struct zip_t *zip = zip_open(file_path.c_str(), 0, 'r'); +bool is_zip_file(const std::string& file_path) { + struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { return false; } @@ -587,7 +585,7 @@ bool is_zip_file(const std::string &file_path) { return true; } -bool 
is_gguf_file(const std::string &file_path) { +bool is_gguf_file(const std::string& file_path) { std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { return false; @@ -608,7 +606,7 @@ bool is_gguf_file(const std::string &file_path) { return true; } -bool is_safetensors_file(const std::string &file_path) { +bool is_safetensors_file(const std::string& file_path) { std::ifstream file(file_path, std::ios::binary); if (!file.is_open()) { return false; @@ -625,7 +623,7 @@ bool is_safetensors_file(const std::string &file_path) { } uint8_t header_size_buf[ST_HEADER_SIZE_LEN]; - file.read((char *) header_size_buf, ST_HEADER_SIZE_LEN); + file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN); if (!file) { return false; } @@ -650,7 +648,7 @@ bool is_safetensors_file(const std::string &file_path) { return true; } -bool ModelLoader::init_from_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) { if (is_directory(file_path)) { LOG_INFO("load %s using diffusers format", file_path.c_str()); return init_from_diffusers_file(file_path, prefix); @@ -671,14 +669,14 @@ bool ModelLoader::init_from_file(const std::string &file_path, const std::string /*================================================= GGUFModelLoader ==================================================*/ -bool ModelLoader::init_from_gguf_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::string& prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - gguf_context *ctx_gguf_ = NULL; - ggml_context *ctx_meta_ = NULL; - ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); + gguf_context* ctx_gguf_ = NULL; + ggml_context* ctx_meta_ = NULL; + ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_}); if (!ctx_gguf_) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; @@ -686,12 +684,12 @@ bool ModelLoader::init_from_gguf_file(const std::string &file_path, const std::s int n_tensors = gguf_get_n_tensors(ctx_gguf_); - size_t total_size = 0; + size_t total_size = 0; size_t data_offset = gguf_get_data_offset(ctx_gguf_); for (int i = 0; i < n_tensors; i++) { - std::string name = gguf_get_tensor_name(ctx_gguf_, i); - struct ggml_tensor *dummy = ggml_get_tensor(ctx_meta_, name.c_str()); - size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); + std::string name = gguf_get_tensor_name(ctx_gguf_, i); + struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str()); + size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i); // LOG_DEBUG("%s", name.c_str()); @@ -710,7 +708,7 @@ bool ModelLoader::init_from_gguf_file(const std::string &file_path, const std::s /*================================================= SafeTensorsModelLoader ==================================================*/ -ggml_type str_to_ggml_type(const std::string &dtype) { +ggml_type str_to_ggml_type(const std::string& dtype) { ggml_type ttype = GGML_TYPE_COUNT; if (dtype == "F16") { ttype = GGML_TYPE_F16; @@ -723,7 +721,7 @@ ggml_type str_to_ggml_type(const std::string &dtype) { } // https://huggingface.co/docs/safetensors/index -bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const std::string& 
prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; @@ -745,7 +743,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const } uint8_t header_size_buf[ST_HEADER_SIZE_LEN]; - file.read((char *) header_size_buf, ST_HEADER_SIZE_LEN); + file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN); if (!file) { LOG_ERROR("read safetensors header size failed: '%s'", file_path.c_str()); return false; @@ -769,8 +767,8 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const nlohmann::json header_ = nlohmann::json::parse(header_buf.data()); - for (auto &item: header_.items()) { - std::string name = item.key(); + for (auto& item : header_.items()) { + std::string name = item.key(); nlohmann::json tensor_info = item.value(); // LOG_DEBUG("%s %s\n", name.c_str(), tensor_info.dump().c_str()); @@ -782,11 +780,11 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const continue; } - std::string dtype = tensor_info["dtype"]; + std::string dtype = tensor_info["dtype"]; nlohmann::json shape = tensor_info["shape"]; size_t begin = tensor_info["data_offsets"][0].get(); - size_t end = tensor_info["data_offsets"][1].get(); + size_t end = tensor_info["data_offsets"][1].get(); ggml_type type = str_to_ggml_type(dtype); if (type == GGML_TYPE_COUNT) { @@ -799,7 +797,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const return false; } - int n_dims = (int) shape.size(); + int n_dims = (int)shape.size(); int64_t ne[4] = {1, 1, 1, 1}; for (int i = 0; i < n_dims; i++) { ne[i] = shape[i].get(); @@ -827,9 +825,9 @@ bool ModelLoader::init_from_safetensors_file(const std::string &file_path, const /*================================================= DiffusersModelLoader ==================================================*/ -bool ModelLoader::init_from_diffusers_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_diffusers_file(const std::string& file_path, const std::string& prefix) { std::string unet_path = path_join(file_path, "unet/diffusion_pytorch_model.safetensors"); - std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); + std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors"); std::string clip_path = path_join(file_path, "text_encoder/model.safetensors"); if (!init_from_safetensors_file(unet_path, "unet.")) { @@ -944,7 +942,7 @@ struct PickleTensorReader { CHECK_SIZE, READ_DIMENS }; - ReadPhase phase = READ_NAME; + ReadPhase phase = READ_NAME; size_t entry_size = 0; int32_t nelements = 0; @@ -957,14 +955,14 @@ struct PickleTensorReader { if (phase == CHECK_SIZE) { if (entry_size == value * ggml_type_size(tensor_storage.type)) { nelements = value; - phase = READ_DIMENS; + phase = READ_DIMENS; return true; } else { phase = READ_NAME; } } else if (phase == READ_DIMENS) { if (tensor_storage.n_dims + 1 > 4) { // too many dimens - phase = READ_NAME; + phase = READ_NAME; tensor_storage.n_dims = 0; } if (nelements % value == 0) { @@ -975,23 +973,23 @@ struct PickleTensorReader { return false; } - void read_global(const std::string &str) { + void read_global(const std::string& str) { if (str == "FloatStorage") { if (read_global_type) { - global_type = GGML_TYPE_F32; + global_type = GGML_TYPE_F32; read_global_type = false; } tensor_storage.type = GGML_TYPE_F32; } else if (str == "HalfStorage") { if (read_global_type) { - global_type = 
GGML_TYPE_F16; + global_type = GGML_TYPE_F16; read_global_type = false; } tensor_storage.type = GGML_TYPE_F16; } } - void read_string(const std::string &str, struct zip_t *zip, std::string dir) { + void read_string(const std::string& str, struct zip_t* zip, std::string dir) { if (str == "storage") { read_global_type = true; } else if (str != "state_dict") { @@ -1004,8 +1002,8 @@ struct PickleTensorReader { { std::string name = zip_entry_name(zip); if (name == entry_name) { - tensor_storage.index_in_zip = (int) i; - entry_size = zip_entry_size(zip); + tensor_storage.index_in_zip = (int)i; + entry_size = zip_entry_size(zip); zip_entry_close(zip); break; } @@ -1017,7 +1015,7 @@ struct PickleTensorReader { } if (!read_global_type && phase == READ_NAME) { tensor_storage.name = str; - phase = READ_DATA; + phase = READ_DATA; tensor_storage.type = global_type; } } @@ -1027,7 +1025,7 @@ struct PickleTensorReader { ggml_type PickleTensorReader::global_type = GGML_TYPE_F32; // all pickle_tensors data type bool PickleTensorReader::read_global_type = false; -int find_char(uint8_t *buffer, int len, char c) { +int find_char(uint8_t* buffer, int len, char c) { for (int pos = 0; pos < len; pos++) { if (buffer[pos] == c) { return pos; @@ -1038,13 +1036,13 @@ int find_char(uint8_t *buffer, int len, char c) { #define MAX_STRING_BUFFER 512 -bool ModelLoader::parse_data_pkl(uint8_t *buffer, +bool ModelLoader::parse_data_pkl(uint8_t* buffer, size_t buffer_size, - zip_t *zip, + zip_t* zip, std::string dir, size_t file_index, - const std::string &prefix) { - uint8_t *buffer_end = buffer + buffer_size; + const std::string& prefix) { + uint8_t* buffer_end = buffer + buffer_size; if (buffer[0] == 0x80) { // proto if (buffer[1] != 2) { LOG_ERROR("Unsupported protocol\n"); @@ -1088,8 +1086,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, buffer++; } buffer++; - } - break; + } break; case 'M': // BININT2 = b'M' # push 2-byte unsigned int { uint16_t value = read_short(buffer); @@ -1097,8 +1094,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, buffer++; } buffer += 2; - } - break; + } break; case 'J': // BININT = b'J' # push four-byte signed int { const int32_t value = read_int(buffer); @@ -1106,8 +1102,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, buffer++; // skip tuple after read num_elements } buffer += 4; - } - break; + } break; case 'X': // BINUNICODE = b'X' # " " " ; counted UTF-8 string argument { const int32_t len = read_int(buffer); @@ -1119,8 +1114,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, memcpy(string_buffer, buffer, len < MAX_STRING_BUFFER ? 
len : (MAX_STRING_BUFFER - 1)); buffer += len; reader.read_string(string_buffer, zip, dir); - } - break; + } break; case 0x8C: // SHORT_BINUNICODE = b'\x8c' # push short string; UTF-8 length < 256 bytes { const int8_t len = *buffer; @@ -1129,8 +1123,7 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, memcpy(string_buffer, buffer, len); buffer += len; // printf("String: '%s'\n", string_buffer); - } - break; + } break; case 'c': // GLOBAL = b'c' # push self.find_class(modname, name); 2 string args { int len = find_char(buffer, MAX_STRING_BUFFER, '\n'); @@ -1142,15 +1135,14 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, memcpy(string_buffer, buffer, len); buffer += len + 1; reader.read_global(string_buffer); - } - break; + } break; case 0x86: // TUPLE2 = b'\x86' # build 2-tuple from two topmost stack items case 0x85: // TUPLE1 = b'\x85' # build 1-tuple from stack top case 't': // TUPLE = b't' # build tuple from topmost stack items if (reader.phase == PickleTensorReader::READ_DIMENS) { reader.tensor_storage.reverse_ne(); reader.tensor_storage.file_index = file_index; - reader.tensor_storage.name = prefix + reader.tensor_storage.name; + reader.tensor_storage.name = prefix + reader.tensor_storage.name; tensor_storages.push_back(reader.tensor_storage); // LOG_DEBUG("%s", reader.tensor_storage.name.c_str()); // reset @@ -1168,31 +1160,31 @@ bool ModelLoader::parse_data_pkl(uint8_t *buffer, return true; } -bool ModelLoader::init_from_ckpt_file(const std::string &file_path, const std::string &prefix) { +bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::string& prefix) { LOG_DEBUG("init from '%s'", file_path.c_str()); file_paths_.push_back(file_path); size_t file_index = file_paths_.size() - 1; - struct zip_t *zip = zip_open(file_path.c_str(), 0, 'r'); + struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { LOG_ERROR("failed to open '%s'", file_path.c_str()); return false; } - int n = (int) zip_entries_total(zip); + int n = (int)zip_entries_total(zip); for (int i = 0; i < n; ++i) { zip_entry_openbyindex(zip, i); { std::string name = zip_entry_name(zip); - size_t pos = name.find("data.pkl"); + size_t pos = name.find("data.pkl"); if (pos != std::string::npos) { std::string dir = name.substr(0, pos); - void *pkl_data = NULL; + void* pkl_data = NULL; size_t pkl_size; zip_entry_read(zip, &pkl_data, &pkl_size); // LOG_DEBUG("%lld", pkl_size); - parse_data_pkl((uint8_t *) pkl_data, pkl_size, zip, dir, file_index, prefix); + parse_data_pkl((uint8_t*)pkl_data, pkl_size, zip, dir, file_index, prefix); free(pkl_data); } @@ -1206,7 +1198,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string &file_path, const std::s SDVersion ModelLoader::get_sd_version() { // return VERSION_1_x; TensorStorage token_embedding_weight; - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) { return VERSION_XL; } @@ -1232,7 +1224,7 @@ SDVersion ModelLoader::get_sd_version() { } ggml_type ModelLoader::get_sd_wtype() { - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } @@ -1246,16 +1238,16 @@ ggml_type ModelLoader::get_sd_wtype() { } std::string ModelLoader::load_merges() { - std::string merges_utf8_str(reinterpret_cast(merges_utf8_c_str), sizeof(merges_utf8_c_str)); + std::string merges_utf8_str(reinterpret_cast(merges_utf8_c_str), 
sizeof(merges_utf8_c_str)); return merges_utf8_str; } -void remove_duplicates(std::vector &vec) { +void remove_duplicates(std::vector& vec) { std::unordered_map name_to_index_map; for (size_t i = 0; i < vec.size(); ++i) { - const std::string ¤t_name = vec[i].name; - auto it = name_to_index_map.find(current_name); + const std::string& current_name = vec[i].name; + auto it = name_to_index_map.find(current_name); if (it != name_to_index_map.end()) { vec[it->second] = vec[i]; @@ -1269,7 +1261,7 @@ void remove_duplicates(std::vector &vec) { bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) { std::vector processed_tensor_storages; - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { // LOG_DEBUG("%s", name.c_str()); if (is_unused_tensor(tensor_storage.name)) { @@ -1291,7 +1283,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } bool is_zip = false; - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { if (tensor_storage.file_index != file_index) { continue; } @@ -1301,7 +1293,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } } - struct zip_t *zip = NULL; + struct zip_t* zip = NULL; if (is_zip) { zip = zip_open(file_path.c_str(), 0, 'r'); if (zip == NULL) { @@ -1313,16 +1305,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend std::vector read_buffer; std::vector convert_buffer; - auto read_data = [&](const TensorStorage &tensor_storage, char *buf, size_t n) { + auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) { if (zip != NULL) { zip_entry_openbyindex(zip, tensor_storage.index_in_zip); size_t entry_size = zip_entry_size(zip); if (entry_size != n) { read_buffer.resize(entry_size); - zip_entry_noallocread(zip, (void *) read_buffer.data(), entry_size); - memcpy((void *) buf, (void *) (read_buffer.data() + tensor_storage.offset), n); + zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); + memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); } else { - zip_entry_noallocread(zip, (void *) buf, n); + zip_entry_noallocread(zip, (void*)buf, n); } zip_entry_close(zip); } else { @@ -1336,13 +1328,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend return true; }; - for (auto &tensor_storage: processed_tensor_storages) { + for (auto& tensor_storage : processed_tensor_storages) { if (tensor_storage.file_index != file_index) { continue; } // LOG_DEBUG("%s", tensor_storage.name.c_str()); - ggml_tensor *dst_tensor = NULL; + ggml_tensor* dst_tensor = NULL; success = on_new_tensor_cb(tensor_storage, &dst_tensor); if (!success) { @@ -1360,37 +1352,37 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend #ifdef SD_USE_METAL || ggml_backend_is_metal(backend) #endif - ) { + ) { // for the CPU and Metal backend, we can copy directly into the tensor if (tensor_storage.type == dst_tensor->type) { GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); - read_data(tensor_storage, (char *) dst_tensor->data, nbytes_to_read); + read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t *) dst_tensor->data, (float *) dst_tensor->data, + bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); } } else { 
read_buffer.resize(tensor_storage.nbytes()); - read_data(tensor_storage, (char *) read_buffer.data(), nbytes_to_read); + read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t *) read_buffer.data(), (float *) read_buffer.data(), + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); } - convert_tensor((void *) read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int) tensor_storage.nelements()); + convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, + dst_tensor->type, (int)tensor_storage.nelements()); } } else { read_buffer.resize(tensor_storage.nbytes()); - read_data(tensor_storage, (char *) read_buffer.data(), nbytes_to_read); + read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t *) read_buffer.data(), (float *) read_buffer.data(), + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); } @@ -1400,9 +1392,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } else { // convert first, then copy to device memory convert_buffer.resize(ggml_nbytes(dst_tensor)); - convert_tensor((void *) read_buffer.data(), tensor_storage.type, - (void *) convert_buffer.data(), dst_tensor->type, - (int) tensor_storage.nelements()); + convert_tensor((void*)read_buffer.data(), tensor_storage.type, + (void*)convert_buffer.data(), dst_tensor->type, + (int)tensor_storage.nelements()); ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); } } @@ -1419,16 +1411,16 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend return success; } -bool ModelLoader::load_tensors(std::map &tensors, +bool ModelLoader::load_tensors(std::map& tensors, ggml_backend_t backend, std::set ignore_tensors, bool standalone) { std::set tensor_names_in_file; - auto on_new_tensor_cb = [&](const TensorStorage &tensor_storage, ggml_tensor **dst_tensor) -> bool { - const std::string &name = tensor_storage.name; + auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { + const std::string& name = tensor_storage.name; tensor_names_in_file.insert(name); - struct ggml_tensor *real; + struct ggml_tensor* real; if (tensors.find(name) != tensors.end()) { real = tensors[name]; } else { @@ -1443,17 +1435,17 @@ bool ModelLoader::load_tensors(std::map &tens } if ( - real->ne[0] != tensor_storage.ne[0] || - real->ne[1] != tensor_storage.ne[1] || - real->ne[2] != tensor_storage.ne[2] || - real->ne[3] != tensor_storage.ne[3]) { + real->ne[0] != tensor_storage.ne[0] || + real->ne[1] != tensor_storage.ne[1] || + real->ne[2] != tensor_storage.ne[2] || + real->ne[3] != tensor_storage.ne[3]) { LOG_ERROR( - "tensor '%s' has wrong shape in model file: " - "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", - name.c_str(), - (int) tensor_storage.ne[0], (int) tensor_storage.ne[1], (int) tensor_storage.ne[2], - (int) tensor_storage.ne[3], - (int) real->ne[0], (int) real->ne[1], (int) real->ne[2], (int) real->ne[3]); + "tensor '%s' has wrong shape in model file: " + "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]", + name.c_str(), + (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], + (int)tensor_storage.ne[3], + (int)real->ne[0], (int)real->ne[1], (int)real->ne[2], 
(int)real->ne[3]); return false; } @@ -1470,7 +1462,7 @@ bool ModelLoader::load_tensors(std::map &tens bool some_tensor_not_init = false; - for (auto pair: tensors) { + for (auto pair : tensors) { if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) { continue; } @@ -1494,18 +1486,18 @@ bool ModelLoader::load_tensors(std::map &tens return true; } -bool ModelLoader::save_to_gguf_file(const std::string &file_path, ggml_type type) { - auto backend = ggml_backend_cpu_init(); +bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) { + auto backend = ggml_backend_cpu_init(); size_t mem_size = 1 * 1024 * 1024; // for padding mem_size += tensor_storages.size() * ggml_tensor_overhead(); mem_size += cal_mem_size(backend, type); LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f); - ggml_context *ggml_ctx = ggml_init({mem_size, NULL, false}); + ggml_context* ggml_ctx = ggml_init({mem_size, NULL, false}); - gguf_context *gguf_ctx = gguf_init_empty(); + gguf_context* gguf_ctx = gguf_init_empty(); - auto on_new_tensor_cb = [&](const TensorStorage &tensor_storage, ggml_tensor **dst_tensor) -> bool { - const std::string &name = tensor_storage.name; + auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { + const std::string& name = tensor_storage.name; ggml_type tensor_type = tensor_storage.type; if (type != GGML_TYPE_COUNT) { @@ -1516,7 +1508,7 @@ bool ModelLoader::save_to_gguf_file(const std::string &file_path, ggml_type type } } - ggml_tensor *tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); + ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne); if (tensor == NULL) { LOG_ERROR("ggml_new_tensor failed"); return false; @@ -1555,14 +1547,14 @@ int64_t ModelLoader::cal_mem_size(ggml_backend_t backend, ggml_type type) { } int64_t mem_size = 0; std::vector processed_tensor_storages; - for (auto &tensor_storage: tensor_storages) { + for (auto& tensor_storage : tensor_storages) { if (is_unused_tensor(tensor_storage.name)) { continue; } preprocess_tensor(tensor_storage, processed_tensor_storages); } - for (auto &tensor_storage: processed_tensor_storages) { + for (auto& tensor_storage : processed_tensor_storages) { ggml_type tensor_type = tensor_storage.type; if (type != GGML_TYPE_COUNT) { if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) { @@ -1578,7 +1570,7 @@ int64_t ModelLoader::cal_mem_size(ggml_backend_t backend, ggml_type type) { return mem_size; } -bool convert(const char *input_path, const char *vae_path, const char *output_path, sd_type_t output_type) { +bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) { ModelLoader model_loader; if (!model_loader.init_from_file(input_path)) { @@ -1592,6 +1584,6 @@ bool convert(const char *input_path, const char *vae_path, const char *output_pa return false; } } - bool success = model_loader.save_to_gguf_file(output_path, (ggml_type) output_type); + bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type); return success; } \ No newline at end of file diff --git a/model.h b/model.h index b0d61547..86f7649c 100644 --- a/model.h +++ b/model.h @@ -120,7 +120,8 @@ class ModelLoader { bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend); bool load_tensors(std::map& tensors, ggml_backend_t backend, - std::set ignore_tensors 
= {}, bool standalone=true); + std::set ignore_tensors = {}, + bool standalone = true); bool save_to_gguf_file(const std::string& file_path, ggml_type type); int64_t cal_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT); ~ModelLoader() = default; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 3954e326..28f5d8c8 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -14,60 +14,59 @@ #include "unet.hpp" #include "vae.hpp" -const char *model_version_to_str[] = { - "1.x", - "2.x", - "XL", +const char* model_version_to_str[] = { + "1.x", + "2.x", + "XL", }; -const char *sampling_methods_str[] = { - "Euler A", - "Euler", - "Heun", - "DPM2", - "DPM++ (2s)", - "DPM++ (2M)", - "modified DPM++ (2M)", - "LCM", +const char* sampling_methods_str[] = { + "Euler A", + "Euler", + "Heun", + "DPM2", + "DPM++ (2s)", + "DPM++ (2M)", + "modified DPM++ (2M)", + "LCM", }; /*================================================== Helper Functions ================================================*/ -void calculate_alphas_cumprod(float *alphas_cumprod, +void calculate_alphas_cumprod(float* alphas_cumprod, float linear_start = 0.00085f, - float linear_end = 0.0120, - int timesteps = TIMESTEPS) { + float linear_end = 0.0120, + int timesteps = TIMESTEPS) { float ls_sqrt = sqrtf(linear_start); float le_sqrt = sqrtf(linear_end); - float amount = le_sqrt - ls_sqrt; + float amount = le_sqrt - ls_sqrt; float product = 1.0f; for (int i = 0; i < timesteps; i++) { - float beta = ls_sqrt + amount * ((float) i / (timesteps - 1)); + float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); product *= 1.0f - powf(beta, 2.0f); alphas_cumprod[i] = product; } } - /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { public: SDVersion version; - bool vae_decode_only = false; + bool vae_decode_only = false; bool free_params_immediately = false; std::shared_ptr rng = std::make_shared(); - int n_threads = -1; - float scale_factor = 0.18215f; + int n_threads = -1; + float scale_factor = 0.18215f; FrozenCLIPEmbedderWithCustomWords cond_stage_model; UNetModel diffusion_model; AutoEncoderKL first_stage_model; bool use_tiny_autoencoder = false; - bool vae_tiling = false; + bool vae_tiling = false; - std::map tensors; + std::map tensors; std::string lora_model_dir; // lora_name => multiplier @@ -75,11 +74,11 @@ class StableDiffusionGGML { std::map loras; std::shared_ptr denoiser = std::make_shared(); - schedule_t schedule = DEFAULT; + schedule_t schedule = DEFAULT; - ggml_backend_t backend = NULL; // general backend - ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type - ggml_type wtype = GGML_TYPE_COUNT; // options weight type + ggml_backend_t backend = NULL; // general backend + ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type + ggml_type wtype = GGML_TYPE_COUNT; // options weight type TinyAutoEncoder tae_first_stage; std::string taesd_path; @@ -97,15 +96,15 @@ class StableDiffusionGGML { ggml_type wtype, schedule_t schedule, bool init_backend_immediately = true) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - free_params_immediately(free_params_immediately), - lora_model_dir(lora_model_dir), - vae_tiling(vae_tiling), - wtype(wtype), - schedule(schedule) { + : n_threads(n_threads), + vae_decode_only(vae_decode_only), + free_params_immediately(free_params_immediately), + lora_model_dir(lora_model_dir), + vae_tiling(vae_tiling), + wtype(wtype), + 
schedule(schedule) {
        first_stage_model.decode_only = vae_decode_only;
-        tae_first_stage.decode_only = vae_decode_only;
+        tae_first_stage.decode_only = vae_decode_only;
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
         }
@@ -151,24 +150,23 @@ class StableDiffusionGGML {
                     rng_type_t rng_type,
                     bool vae_tiling,
                     sd_type_t wtype,
-                     schedule_t schedule
-    ) {
-        this->n_threads = n_threads;
-        this->vae_decode_only = vae_decode_only;
+                     schedule_t schedule) {
+        this->n_threads               = n_threads;
+        this->vae_decode_only         = vae_decode_only;
         this->free_params_immediately = free_params_immediately;
-        this->lora_model_dir = lora_model_dir;
+        this->lora_model_dir          = lora_model_dir;
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
         }
         this->vae_tiling = vae_tiling;
-        this->wtype = (ggml_type) wtype;
-        this->schedule = schedule;
+        this->wtype      = (ggml_type)wtype;
+        this->schedule   = schedule;
         apply_schedule();
     }

-    bool load_clip_from_file(const std::string &model_path, bool standalone = true, const std::string &prefix = "te.") {
+    bool load_clip_from_file(const std::string& model_path, bool standalone = true, const std::string& prefix = "te.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -234,11 +232,11 @@
         }

         struct ggml_init_params params;
-        params.mem_size = static_cast<size_t>(3 * 1024) * 1024;  // 10M
+        params.mem_size   = static_cast<size_t>(3 * 1024) * 1024;  // 3 MB
         params.mem_buffer = NULL;
-        params.no_alloc = false;
+        params.no_alloc   = false;
         // LOG_DEBUG("mem_size %u ", params.mem_size);
-        struct ggml_context *ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
             return false;
@@ -248,10 +246,10 @@
         LOG_DEBUG("loading clip weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor*> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        for (auto &pair: tensors) {
+        for (auto& pair : tensors) {
             tensors_need_to_load.insert(pair);
         }
@@ -275,9 +273,9 @@
         }
     }

-    bool load_unet_from_file(const std::string &model_path,
-                             bool standalone = true,
-                             const std::string &prefix = "unet.") {
+    bool load_unet_from_file(const std::string& model_path,
+                             bool standalone = true,
+                             const std::string& prefix = "unet.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -310,11 +308,11 @@
         }

         struct ggml_init_params params;
-        params.mem_size = static_cast<size_t>(3 * 1024) * 1024;  // 10M
+        params.mem_size   = static_cast<size_t>(3 * 1024) * 1024;  // 3 MB
         params.mem_buffer = NULL;
-        params.no_alloc = false;
+        params.no_alloc   = false;

-        struct ggml_context *ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
             return false;
@@ -325,13 +323,13 @@
         LOG_DEBUG("loading weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor*> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        ggml_tensor *alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
-
calculate_alphas_cumprod((float *) alphas_cumprod_tensor->data); + ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); + calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); tensors_need_to_load["alphas_cumprod"] = alphas_cumprod_tensor; - for (auto &pair: tensors) { - const std::string &name = pair.first; + for (auto& pair : tensors) { + const std::string& name = pair.first; if (starts_with(name, "cond_stage_model.") || starts_with(name, "first_stage_model.")) { ignore_tensors.insert(name); continue; @@ -367,16 +365,15 @@ class StableDiffusionGGML { return true; } - void free_unet_params() { if (diffusion_model.params_buffer_size > 0) { diffusion_model.free_params_buffer(); } } - bool load_vae_from_file(const std::string &model_path, - bool standalone = true, - const std::string &prefix = "vae.") { + bool load_vae_from_file(const std::string& model_path, + bool standalone = true, + const std::string& prefix = "vae.") { if (backend == NULL) { LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); return false; @@ -413,11 +410,11 @@ class StableDiffusionGGML { } struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_size = static_cast(10 * 1024) * 1024; // 10M params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check if (!ctx) { LOG_ERROR("ggml_init() failed"); return false; @@ -427,10 +424,10 @@ class StableDiffusionGGML { LOG_DEBUG("loading weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set ignore_tensors; - for (auto &pair: tensors) { - const std::string &name = pair.first; + for (auto& pair : tensors) { + const std::string& name = pair.first; // TODO: make it can reload in compute time. so we can set vae_decode_only dynamic. 
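// The filter just below is what fixes vae_decode_only at load time: when it is
// set, "first_stage_model.encoder.*" and "first_stage_model.quant*" weights are
// skipped and never materialized, so switching the same context back to encoding
// later means reloading the VAE weights (hence the TODO above).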
if (vae_decode_only && (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) { @@ -459,8 +456,8 @@ class StableDiffusionGGML { } } - //load the all model from one file - bool load_diffusions_from_file(const std::string &model_path) { + // load the all model from one file + bool load_diffusions_from_file(const std::string& model_path) { LOG_INFO("loading model from '%s'", model_path.c_str()); if (!load_clip_from_file(model_path, false, "")) { free_clip_params(); @@ -494,7 +491,7 @@ class StableDiffusionGGML { LOG_INFO("free vae params"); } - bool load_taesd_from_file(const std::string &taesd_path) { + bool load_taesd_from_file(const std::string& taesd_path) { if (first_stage_model.params_buffer_size > 0) { free_vae_params(); } @@ -511,34 +508,34 @@ class StableDiffusionGGML { } } - bool is_using_v_parameterization_for_sd2(ggml_context *work_ctx) { - struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); + bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) { + struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); ggml_set_f32(x_t, 0.5); - struct ggml_tensor *c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); + struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); ggml_set_f32(c, 0.5); - struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, - 1); // [N, ] - struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, - diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + 1); // [N, ] + struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] int64_t t0 = ggml_time_ms(); ggml_set_f32(timesteps, 999); set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels); - struct ggml_tensor *out = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); diffusion_model.alloc_compute_buffer(x_t, c, t_emb); diffusion_model.compute(out, n_threads, x_t, NULL, c, t_emb); diffusion_model.free_compute_buffer(); double result = 0.f; { - float *vec_x = (float *) x_t->data; - float *vec_out = (float *) out->data; + float* vec_x = (float*)x_t->data; + float* vec_out = (float*)out->data; int64_t n = ggml_nelements(out); for (int i = 0; i < n; i++) { - result += ((double) vec_out[i] - (double) vec_x[i]); + result += ((double)vec_out[i] - (double)vec_x[i]); } result /= n; } @@ -571,15 +568,15 @@ class StableDiffusionGGML { for (int i = 0; i < TIMESTEPS; i++) { denoiser->schedule->alphas_cumprod[i] = alphas_cumprod_tensor[i]; - denoiser->schedule->sigmas[i] = std::sqrt( - (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); + denoiser->schedule->sigmas[i] = std::sqrt( + (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); } } - void apply_lora(const std::string &lora_name, float multiplier) { - int64_t t0 = ggml_time_ms(); - std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); + void apply_lora(const std::string& lora_name, float multiplier) { + int64_t t0 = ggml_time_ms(); + std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); std::string 
file_path; if (file_exists(st_file_path)) { @@ -609,14 +606,14 @@ class StableDiffusionGGML { (t1 - t0) * 1.0f / 1000); } - void apply_loras(const std::unordered_map &lora_state) { + void apply_loras(const std::unordered_map& lora_state) { if (lora_state.size() > 0 && model_data_type != GGML_TYPE_F16 && model_data_type != GGML_TYPE_F32) { LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); } std::unordered_map lora_state_diff; - for (auto &kv: lora_state) { - const std::string &lora_name = kv.first; - float multiplier = kv.second; + for (auto& kv : lora_state) { + const std::string& lora_name = kv.first; + float multiplier = kv.second; if (curr_lora_state.find(lora_name) != curr_lora_state.end()) { float curr_multiplier = curr_lora_state[lora_name]; @@ -629,35 +626,35 @@ class StableDiffusionGGML { } } - for (auto &kv: lora_state_diff) { + for (auto& kv : lora_state_diff) { apply_lora(kv.first, kv.second); } curr_lora_state = lora_state; } - std::pair get_learned_condition(ggml_context *work_ctx, - const std::string &text, - int clip_skip, - int width, - int height, - bool force_zero_embeddings = false) { + std::pair get_learned_condition(ggml_context* work_ctx, + const std::string& text, + int clip_skip, + int width, + int height, + bool force_zero_embeddings = false) { cond_stage_model.set_clip_skip(clip_skip); - auto tokens_and_weights = cond_stage_model.tokenize(text, true); - std::vector &tokens = tokens_and_weights.first; - std::vector &weights = tokens_and_weights.second; - int64_t t0 = ggml_time_ms(); - struct ggml_tensor *pooled = NULL; - size_t total_hidden_size = cond_stage_model.text_model.hidden_size; + auto tokens_and_weights = cond_stage_model.tokenize(text, true); + std::vector& tokens = tokens_and_weights.first; + std::vector& weights = tokens_and_weights.second; + int64_t t0 = ggml_time_ms(); + struct ggml_tensor* pooled = NULL; + size_t total_hidden_size = cond_stage_model.text_model.hidden_size; if (version == VERSION_XL) { total_hidden_size += cond_stage_model.text_model2.hidden_size; pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, cond_stage_model.text_model2.projection_dim); } - struct ggml_tensor *hidden_states = ggml_new_tensor_2d(work_ctx, + struct ggml_tensor* hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, total_hidden_size, cond_stage_model.text_model.max_position_embeddings); // [N, n_token, hidden_size] - cond_stage_model.alloc_compute_buffer(work_ctx, (int) tokens.size()); + cond_stage_model.alloc_compute_buffer(work_ctx, (int)tokens.size()); cond_stage_model.compute(n_threads, tokens, hidden_states, pooled); cond_stage_model.free_compute_buffer(); // if (pooled != NULL) { @@ -667,7 +664,7 @@ class StableDiffusionGGML { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - ggml_tensor *result = ggml_dup_tensor(work_ctx, hidden_states); + ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states); { float original_mean = ggml_tensor_mean(hidden_states); for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) { @@ -683,34 +680,34 @@ class StableDiffusionGGML { ggml_tensor_scale(result, (original_mean / new_mean)); } if (force_zero_embeddings) { - float *vec = (float *) result->data; + float* vec = (float*)result->data; for (int i = 0; i < ggml_nelements(result); i++) { vec[i] = 0; } } - ggml_tensor *vec = NULL; + ggml_tensor* vec = NULL; if (version == VERSION_XL) { int out_dim = 256; - vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 
diffusion_model.adm_in_channels); + vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); // [0:1280] size_t offset = 0; memcpy(vec->data, pooled->data, ggml_nbytes(pooled)); offset += ggml_nbytes(pooled); - struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); + struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); // original_size_as_tuple - float orig_width = (float) width; - float orig_height = (float) height; + float orig_width = (float)width; + float orig_height = (float)height; ggml_tensor_set_f32(timesteps, orig_height, 0); ggml_tensor_set_f32(timesteps, orig_width, 1); - ggml_tensor *embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, + ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); offset += ggml_nbytes(embed_view); set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // crop_coords_top_left - float crop_coord_top = 0.f; + float crop_coord_top = 0.f; float crop_coord_left = 0.f; ggml_tensor_set_f32(timesteps, crop_coord_top, 0); ggml_tensor_set_f32(timesteps, crop_coord_left, 1); @@ -719,8 +716,8 @@ class StableDiffusionGGML { set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // target_size_as_tuple - float target_width = (float) width; - float target_height = (float) height; + float target_width = (float)width; + float target_height = (float)height; ggml_tensor_set_f32(timesteps, target_height, 0); ggml_tensor_set_f32(timesteps, target_width, 1); embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); @@ -733,27 +730,27 @@ class StableDiffusionGGML { return {result, vec}; } - ggml_tensor *sample(ggml_context *work_ctx, - ggml_tensor *x_t, - ggml_tensor *noise, - ggml_tensor *c, - ggml_tensor *c_vector, - ggml_tensor *uc, - ggml_tensor *uc_vector, + ggml_tensor* sample(ggml_context* work_ctx, + ggml_tensor* x_t, + ggml_tensor* noise, + ggml_tensor* c, + ggml_tensor* c_vector, + ggml_tensor* uc, + ggml_tensor* uc_vector, float cfg_scale, sample_method_t method, - const std::vector &sigmas) { + const std::vector& sigmas) { size_t steps = sigmas.size() - 1; // x_t = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(x_t); - struct ggml_tensor *x = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor* x = ggml_dup_tensor(work_ctx, x_t); copy_ggml_tensor(x, x_t); - struct ggml_tensor *noised_input = ggml_dup_tensor(work_ctx, x_t); - struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, - 1); // [N, ] - struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, - diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + 1); // [N, ] + struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] diffusion_model.alloc_compute_buffer(noised_input, c, t_emb, c_vector); bool has_unconditioned = cfg_scale != 1.0 && uc != NULL; @@ -768,31 +765,31 @@ class StableDiffusionGGML { } // denoise wrapper - struct ggml_tensor *out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *out_uncond = NULL; + struct ggml_tensor* 
out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* out_uncond = NULL; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); } - struct ggml_tensor *denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - auto denoise = [&](ggml_tensor *input, float sigma, int step) { + auto denoise = [&](ggml_tensor* input, float sigma, int step) { if (step == 1) { - pretty_progress(0, (int) steps, 0); + pretty_progress(0, (int)steps, 0); } int64_t t0 = ggml_time_us(); - float c_skip = 1.0f; - float c_out = 1.0f; - float c_in = 1.0f; + float c_skip = 1.0f; + float c_out = 1.0f; + float c_in = 1.0f; std::vector scaling = denoiser->get_scalings(sigma); if (scaling.size() == 3) { // CompVisVDenoiser c_skip = scaling[0]; - c_out = scaling[1]; - c_in = scaling[2]; + c_out = scaling[1]; + c_in = scaling[2]; } else { // CompVisDenoiser c_out = scaling[0]; - c_in = scaling[1]; + c_in = scaling[1]; } float t = denoiser->schedule->sigma_to_t(sigma); @@ -806,16 +803,16 @@ class StableDiffusionGGML { // cond diffusion_model.compute(out_cond, n_threads, noised_input, NULL, c, t_emb, c_vector); - float *negative_data = NULL; + float* negative_data = NULL; if (has_unconditioned) { // uncond diffusion_model.compute(out_uncond, n_threads, noised_input, NULL, uc, t_emb, uc_vector); - negative_data = (float *) out_uncond->data; + negative_data = (float*)out_uncond->data; } - float *vec_denoised = (float *) denoised->data; - float *vec_input = (float *) input->data; - float *positive_data = (float *) out_cond->data; - int ne_elements = (int) ggml_nelements(denoised); + float* vec_denoised = (float*)denoised->data; + float* vec_input = (float*)input->data; + float* positive_data = (float*)out_cond->data; + int ne_elements = (int)ggml_nelements(denoised); for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -828,7 +825,7 @@ class StableDiffusionGGML { } int64_t t1 = ggml_time_us(); if (step > 0) { - pretty_progress(step, (int) steps, (t1 - t0) / 1000000.f); + pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } }; @@ -836,8 +833,8 @@ class StableDiffusionGGML { // sample_euler_ancestral switch (method) { case EULER_A: { - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -847,9 +844,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int i = 0; i < ggml_nelements(d); i++) { vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; @@ -857,18 +854,18 @@ class StableDiffusionGGML { } // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * - (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / - (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * 
sigma_up); // Euler method float dt = sigma_down - sigmas[i]; // x = x + d * dt { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_d[i] * dt; @@ -880,8 +877,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float *vec_x = (float *) x->data; - float *vec_noise = (float *) noise->data; + float* vec_x = (float*)x->data; + float* vec_noise = (float*)noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -889,11 +886,10 @@ class StableDiffusionGGML { } } } - } - break; + } break; case EULER: // Implemented without any sigma churn { - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -903,9 +899,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma; @@ -915,19 +911,18 @@ class StableDiffusionGGML { float dt = sigmas[i + 1] - sigma; // x = x + d * dt { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } } - } - break; + } break; case HEUN: { - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -935,9 +930,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -948,25 +943,25 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } else { // Heun step - float *vec_d = (float *) d->data; - float *vec_d2 = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_x2 = (float *) x2->data; + float* vec_d = (float*)d->data; + float* vec_d2 = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_x2 = (float*)x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt; } denoise(x2, sigmas[i + 1], i + 1); - float *vec_denoised = (float *) denoised->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; vec_d[j] = (vec_d[j] + d2) / 2; @@ -974,11 +969,10 @@ class StableDiffusionGGML { } } } 
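// Heun recap for the branch above: one textbook predictor-corrector step on
// dx/dsigma, where D is the denoiser:
//   d  = (x - D(x; s_i)) / s_i                  Euler slope at s_i
//   x2 = x + d * (s_{i+1} - s_i)                predictor
//   d2 = (x2 - D(x2; s_{i+1})) / s_{i+1}        slope at the predicted point
//   x += ((d + d2) / 2) * (s_{i+1} - s_i)       trapezoidal correction
// which is why denoise() runs twice per step unless s_{i+1} == 0, where the
// plain Euler update is taken instead.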
- } - break; + } break; case DPM2: { - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -986,9 +980,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -998,9 +992,9 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float dt = sigmas[i + 1] - sigmas[i]; - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; + float dt = sigmas[i + 1] - sigmas[i]; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; @@ -1008,18 +1002,18 @@ class StableDiffusionGGML { } else { // DPM-Solver-2 float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1]))); - float dt_1 = sigma_mid - sigmas[i]; - float dt_2 = sigmas[i + 1] - sigmas[i]; + float dt_1 = sigma_mid - sigmas[i]; + float dt_2 = sigmas[i + 1] - sigmas[i]; - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_x2 = (float *) x2->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_x2 = (float*)x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt_1; } denoise(x2, sigma_mid, i + 1); - float *vec_denoised = (float *) denoised->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; vec_x[j] = vec_x[j] + d2 * dt_2; @@ -1027,31 +1021,30 @@ class StableDiffusionGGML { } } - } - break; + } break; case DPMPP2S_A: { - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise denoise(x, sigmas[i], i + 1); // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * - (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / - (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - auto sigma_fn = [](float t) -> float { return exp(-t); }; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + auto sigma_fn = [](float t) -> float { return exp(-t); }; if (sigma_down == 0) { // Euler step - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { 
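// Context for the get_ancestral_step values computed above (shared with EULER_A):
// the deterministic part of the step only travels down to sigma_down, and the
// remaining variance to reach s_{i+1} is refilled with fresh noise scaled by
// sigma_up, with
//   sigma_up   = min(s_{i+1}, sqrt(s_{i+1}^2 * (s_i^2 - s_{i+1}^2) / s_i^2))
//   sigma_down = sqrt(s_{i+1}^2 - sigma_up^2)
// so the marginal noise level at s_{i+1} is preserved while adding stochasticity.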
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -1067,15 +1060,15 @@ class StableDiffusionGGML { } } else { // DPM-Solver++(2S) - float t = t_fn(sigmas[i]); + float t = t_fn(sigmas[i]); float t_next = t_fn(sigma_down); - float h = t_next - t; - float s = t + 0.5f * h; + float h = t_next - t; + float s = t + 0.5f * h; - float *vec_d = (float *) d->data; - float *vec_x = (float *) x->data; - float *vec_x2 = (float *) x2->data; - float *vec_denoised = (float *) denoised->data; + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_x2 = (float*)x2->data; + float* vec_denoised = (float*)denoised->data; // First half-step for (int j = 0; j < ggml_nelements(x); j++) { @@ -1094,8 +1087,8 @@ class StableDiffusionGGML { if (sigmas[i + 1] > 0) { ggml_tensor_set_f32_randn(noise, rng); { - float *vec_x = (float *) x->data; - float *vec_noise = (float *) noise->data; + float* vec_x = (float*)x->data; + float* vec_noise = (float*)noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -1103,11 +1096,10 @@ class StableDiffusionGGML { } } } - } - break; + } break; case DPMPP2M: // DPM++ (2M) from Karras et al (2022) { - struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1115,14 +1107,14 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float b = exp(-h) - 1.f; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; - float *vec_old_denoised = (float *) old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float b = exp(-h) - 1.f; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; + float* vec_old_denoised = (float*)old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -1131,10 +1123,10 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float r = h_last / h; + float r = h_last / h; for (int j = 0; j < ggml_nelements(x); j++) { float denoised_d = - (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -1144,11 +1136,10 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } - break; + } break; case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 { - struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1156,13 +1147,13 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; - float *vec_old_denoised = (float *) old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float* vec_x = 
(float*)x->data; + float* vec_denoised = (float*)denoised->data; + float* vec_old_denoised = (float*)old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -1172,14 +1163,14 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float h_min = std::min(h_last, h); - float h_max = std::max(h_last, h); - float r = h_max / h_min; - float h_d = (h_max + h_min) / 2.f; - float b = exp(-h_d) - 1.f; + float h_min = std::min(h_last, h); + float h_max = std::max(h_last, h); + float r = h_max / h_min; + float h_d = (h_max + h_min) / 2.f; + float b = exp(-h_d) - 1.f; for (int j = 0; j < ggml_nelements(x); j++) { float denoised_d = - (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -1189,12 +1180,11 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } - break; + } break; case LCM: // Latent Consistency Models { - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -1204,8 +1194,8 @@ class StableDiffusionGGML { // x = denoised { - float *vec_x = (float *) x->data; - float *vec_denoised = (float *) denoised->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_denoised[j]; } @@ -1216,8 +1206,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float *vec_x = (float *) x->data; - float *vec_noise = (float *) noise->data; + float* vec_x = (float*)x->data; + float* vec_noise = (float*)noise->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j]; @@ -1225,8 +1215,7 @@ class StableDiffusionGGML { } } } - } - break; + } break; default: LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); @@ -1237,28 +1226,28 @@ class StableDiffusionGGML { } // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor *get_first_stage_encoding(ggml_context *work_ctx, ggml_tensor *moments) { + ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor *latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], - moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, latent); + ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], + moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "noise.bin"); { - float mean = 0; + float mean = 0; float logvar = 0; - float value = 0; - float std_ = 0; + float value = 0; + float std_ = 0; for (int i = 0; i < latent->ne[3]; i++) { for (int j = 0; j < latent->ne[2]; j++) { for (int k = 0; k < latent->ne[1]; k++) { for (int l = 0; l < latent->ne[0]; l++) { - mean = ggml_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_tensor_get_f32(moments, l, k, j + 
(int) latent->ne[2], i); + mean = ggml_tensor_get_f32(moments, l, k, j, i); + logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); - value = value * scale_factor; + std_ = std::exp(0.5f * logvar); + value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); + value = value * scale_factor; // printf("%d %d %d %d -> %f\n", i, j, k, l, value); ggml_tensor_set_f32(latent, value, l, k, j, i); } @@ -1269,14 +1258,14 @@ class StableDiffusionGGML { return latent; } - ggml_tensor *compute_first_stage(ggml_context *work_ctx, ggml_tensor *x, bool decode) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - ggml_tensor *result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, + ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + ggml_tensor* result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, decode ? (W * 8) : (W / 8), // width decode ? (H * 8) : (H / 8), // height decode ? 3 : (use_tiny_autoencoder ? 4 : 8)); // channels - int64_t t0 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); if (!use_tiny_autoencoder) { if (decode) { ggml_tensor_scale(x, 1.0f / scale_factor); @@ -1285,7 +1274,7 @@ class StableDiffusionGGML { } if (vae_tiling && decode) { // TODO: support tiling vae encode // split latent in 32x32 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) { + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { if (init) { first_stage_model.alloc_compute_buffer(in, decode); } else { @@ -1304,7 +1293,7 @@ class StableDiffusionGGML { } else { if (vae_tiling && decode) { // TODO: support tiling vae encode // split latent in 64x64 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) { + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { if (init) { tae_first_stage.alloc_compute_buffer(in, decode); } else { @@ -1327,11 +1316,11 @@ class StableDiffusionGGML { return result; } - ggml_tensor *encode_first_stage(ggml_context *work_ctx, ggml_tensor *x) { + ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { return compute_first_stage(work_ctx, x, false); } - ggml_tensor *decode_first_stage(ggml_context *work_ctx, ggml_tensor *x) { + ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { return compute_first_stage(work_ctx, x, true); } }; @@ -1339,19 +1328,19 @@ class StableDiffusionGGML { /*================================================= SD API ==================================================*/ struct sd_ctx_t { - StableDiffusionGGML *sd = NULL; + StableDiffusionGGML* sd = NULL; }; -sd_ctx_t *new_sd_ctx(int n_threads, +sd_ctx_t* new_sd_ctx(int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir_c_str, + const char* lora_model_dir_c_str, enum rng_type_t rng_type, bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, bool init_backend_immediately) { - sd_ctx_t *sd_ctx = (sd_ctx_t *) malloc(sizeof(sd_ctx_t)); + sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; } @@ -1365,12 +1354,11 @@ sd_ctx_t *new_sd_ctx(int n_threads, vae_tiling, static_cast(wtype), s, - init_backend_immediately - ); + init_backend_immediately); return sd_ctx; } -void free_sd_ctx(sd_ctx_t *sd_ctx) { +void free_sd_ctx(sd_ctx_t* 
sd_ctx) { if (sd_ctx->sd != NULL) { delete sd_ctx->sd; sd_ctx->sd = NULL; @@ -1378,7 +1366,7 @@ void free_sd_ctx(sd_ctx_t *sd_ctx) { free(sd_ctx); } -void init_backend(sd_ctx_t *sd_ctx) { +void init_backend(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1386,41 +1374,39 @@ void init_backend(sd_ctx_t *sd_ctx) { sd_ctx->sd->init_backend(); } -void set_options(sd_ctx_t *sd_ctx, +void set_options(sd_ctx_t* sd_ctx, int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir, + const char* lora_model_dir, rng_type_t rng_type, bool vae_tiling, sd_type_t wtype, - schedule_t schedule -) { + schedule_t schedule) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; } sd_ctx->sd->set_options( - n_threads, - vae_decode_only, - free_params_immediately, - std::string(lora_model_dir), - rng_type, - vae_tiling, - wtype, - schedule - ); + n_threads, + vae_decode_only, + free_params_immediately, + std::string(lora_model_dir), + rng_type, + vae_tiling, + wtype, + schedule); } -bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; } - return sd_ctx->sd->load_clip_from_file(std::string(model_path), true,std::string(prefix)); + return sd_ctx->sd->load_clip_from_file(std::string(model_path), true, std::string(prefix)); } -void free_clip_params(sd_ctx_t *sd_ctx) { +void free_clip_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1428,7 +1414,7 @@ void free_clip_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_clip_params(); } -bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1436,7 +1422,7 @@ bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *p return sd_ctx->sd->load_unet_from_file(std::string(model_path), true, std::string(prefix)); } -void free_unet_params(sd_ctx_t *sd_ctx) { +void free_unet_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1444,7 +1430,7 @@ void free_unet_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_unet_params(); } -bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1452,7 +1438,7 @@ bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *pr return sd_ctx->sd->load_vae_from_file(std::string(model_path), true, std::string(prefix)); } -void free_vae_params(sd_ctx_t *sd_ctx) { +void free_vae_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1460,7 +1446,7 @@ void free_vae_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_vae_params(); } -bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { +bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { 
LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1468,7 +1454,7 @@ bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { return sd_ctx->sd->load_taesd_from_file(std::string(model_path)); } -void free_taesd_params(sd_ctx_t *sd_ctx) { +void free_taesd_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1477,7 +1463,7 @@ void free_taesd_params(sd_ctx_t *sd_ctx) { } // load all model from one file -bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { +bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1486,7 +1472,7 @@ bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { } // free all model from one file -void free_diffusions_params(sd_ctx_t *sd_ctx) { +void free_diffusions_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1494,9 +1480,9 @@ void free_diffusions_params(sd_ctx_t *sd_ctx) { return sd_ctx->sd->free_diffusions_params(); } -sd_image_t *txt2img(sd_ctx_t *sd_ctx, - const char *prompt_c_str, - const char *negative_prompt_c_str, +sd_image_t* txt2img(sd_ctx_t* sd_ctx, + const char* prompt_c_str, + const char* negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1514,10 +1500,10 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, std::string negative_prompt(negative_prompt_c_str); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto &kv: lora_f2m) { + for (auto& kv : lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } @@ -1533,10 +1519,10 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, params.mem_size += width * height * 3 * sizeof(float); params.mem_size *= batch_count; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context *work_ctx = ggml_init(params); + struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; @@ -1546,16 +1532,16 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library // by a third party with a seed <0, let's incorporate randomization here. 
- srand((int) time(NULL)); + srand((int)time(NULL)); seed = rand(); } - t0 = ggml_time_ms(); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor *c = cond_pair.first; - ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor *uc = NULL; - struct ggml_tensor *uc_vector = NULL; + t0 = ggml_time_ms(); + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor* c = cond_pair.first; + ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor* uc = NULL; + struct ggml_tensor* uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { @@ -1563,8 +1549,8 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, } auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); @@ -1573,23 +1559,23 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, sd_ctx->sd->cond_stage_model.free_params_buffer(); } - std::vector final_latents; // collect latents to decode + std::vector final_latents; // collect latents to decode int C = 4; int W = width / 8; int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); - int64_t cur_seed = seed + b; + int64_t cur_seed = seed + b; LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, + struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigmas); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1606,10 +1592,10 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, (t3 - t1) * 1.0f / 1000); LOG_INFO("decoding %zu latents", final_latents.size()); - std::vector decoded_images; // collect decoded images + std::vector decoded_images; // collect decoded images for (size_t i = 0; i < final_latents.size(); i++) { - t1 = ggml_time_ms(); - struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); + t1 = ggml_time_ms(); + struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); // print_ggml_tensor(img); if (img != NULL) { decoded_images.push_back(img); @@ -1623,30 +1609,30 @@ sd_image_t *txt2img(sd_ctx_t *sd_ctx, if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model.free_params_buffer(); } - sd_image_t *result_images = (sd_image_t *) calloc(batch_count, sizeof(sd_image_t)); + sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); 
return NULL; } for (size_t i = 0; i < decoded_images.size(); i++) { - result_images[i].width = width; - result_images[i].height = height; + result_images[i].width = width; + result_images[i].height = height; result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(decoded_images[i]); + result_images[i].data = sd_tensor_to_image(decoded_images[i]); } ggml_free(work_ctx); LOG_INFO( - "txt2img completed in %.2fs", - (t4 - t0) * 1.0f / 1000); + "txt2img completed in %.2fs", + (t4 - t0) * 1.0f / 1000); return result_images; } -sd_image_t *img2img(sd_ctx_t *sd_ctx, +sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, - const char *prompt_c_str, - const char *negative_prompt_c_str, + const char* prompt_c_str, + const char* negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1665,7 +1651,7 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, LOG_INFO("img2img %dx%d", width, height); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - size_t t_enc = static_cast(sample_steps * strength); + size_t t_enc = static_cast(sample_steps * strength); LOG_INFO("target t_enc is %zu steps", t_enc); std::vector sigma_sched; sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); @@ -1674,26 +1660,26 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, params.mem_size = static_cast(10 * 1024) * 1024; // 10 MB params.mem_size += width * height * 3 * sizeof(float) * 2; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); // draft context - struct ggml_context *work_ctx = ggml_init(params); + struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; } if (seed < 0) { - seed = (int) time(NULL); + seed = (int)time(NULL); } sd_ctx->sd->rng->manual_seed(seed); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto &kv: lora_f2m) { + for (auto& kv : lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } prompt = result_pair.second; @@ -1705,13 +1691,13 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, int64_t t1 = ggml_time_ms(); LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_tensor *init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(init_image.data, init_img); - t0 = ggml_time_ms(); - ggml_tensor *init_latent = NULL; + t0 = ggml_time_ms(); + ggml_tensor* init_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor *moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); } else { init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } @@ -1719,11 +1705,11 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor *c = cond_pair.first; - ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor *uc = NULL; - struct 
ggml_tensor *uc_vector = NULL; + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor* c = cond_pair.first; + ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor* uc = NULL; + struct ggml_tensor* uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { @@ -1731,8 +1717,8 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, } auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } int64_t t2 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1); @@ -1741,11 +1727,11 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, } sd_ctx->sd->rng->manual_seed(seed); - struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, init_latent); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, + struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigma_sched); // struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1755,7 +1741,7 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, sd_ctx->sd->diffusion_model.free_params_buffer(); } - struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); + struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model.free_params_buffer(); } @@ -1764,17 +1750,17 @@ sd_image_t *img2img(sd_ctx_t *sd_ctx, return NULL; } - sd_image_t *result_images = (sd_image_t *) calloc(1, sizeof(sd_image_t)); + sd_image_t* result_images = (sd_image_t*)calloc(1, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); return NULL; } for (size_t i = 0; i < 1; i++) { - result_images[i].width = width; - result_images[i].height = height; + result_images[i].width = width; + result_images[i].height = height; result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(img); + result_images[i].data = sd_tensor_to_image(img); } ggml_free(work_ctx); diff --git a/stable-diffusion.h b/stable-diffusion.h index 0d59dce4..5a12543a 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -54,8 +54,8 @@ enum schedule_t { // same as enum ggml_type enum sd_type_t { - SD_TYPE_F32 = 0, - SD_TYPE_F16 = 1, + SD_TYPE_F32 = 0, + SD_TYPE_F16 = 1, SD_TYPE_Q4_0 = 2, SD_TYPE_Q4_1 = 3, // SD_TYPE_Q4_2 = 4, support has been removed @@ -65,12 +65,12 @@ enum sd_type_t { SD_TYPE_Q8_0 = 8, SD_TYPE_Q8_1 = 9, // k-quantizations - SD_TYPE_Q2_K = 10, - SD_TYPE_Q3_K = 11, - SD_TYPE_Q4_K = 12, - SD_TYPE_Q5_K = 13, - SD_TYPE_Q6_K = 14, - SD_TYPE_Q8_K = 15, + SD_TYPE_Q2_K = 10, + SD_TYPE_Q3_K = 11, + SD_TYPE_Q4_K = 12, + SD_TYPE_Q5_K = 13, + SD_TYPE_Q6_K = 14, + SD_TYPE_Q8_K = 15, SD_TYPE_IQ2_XXS = 16, SD_TYPE_I8, SD_TYPE_I16, @@ -78,7 +78,7 @@ enum sd_type_t { SD_TYPE_COUNT, }; -SD_API const char *sd_type_name(enum sd_type_t type); +SD_API const char* sd_type_name(enum 
sd_type_t type); enum sd_log_level_t { SD_LOG_DEBUG, @@ -87,36 +87,36 @@ enum sd_log_level_t { SD_LOG_ERROR }; -typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char *text, void *data); +typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); -SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void *data); +SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API int32_t get_num_physical_cores(); -SD_API const char *sd_get_system_info(); +SD_API const char* sd_get_system_info(); typedef struct { uint32_t width; uint32_t height; uint32_t channel; - uint8_t *data; + uint8_t* data; } sd_image_t; typedef struct sd_ctx_t sd_ctx_t; -SD_API sd_ctx_t *new_sd_ctx(int n_threads, +SD_API sd_ctx_t* new_sd_ctx(int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir_c_str, + const char* lora_model_dir_c_str, enum rng_type_t rng_type, bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, bool init_backend_immediately = true); -SD_API void free_sd_ctx(sd_ctx_t *sd_ctx); +SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -SD_API sd_image_t *txt2img(sd_ctx_t *sd_ctx, - const char *prompt, - const char *negative_prompt, +SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, + const char* prompt, + const char* negative_prompt, int clip_skip, float cfg_scale, int width, @@ -126,10 +126,10 @@ SD_API sd_image_t *txt2img(sd_ctx_t *sd_ctx, int64_t seed, int batch_count); -SD_API sd_image_t *img2img(sd_ctx_t *sd_ctx, +SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, - const char *prompt, - const char *negative_prompt, + const char* prompt, + const char* negative_prompt, int clip_skip, float cfg_scale, int width, @@ -142,46 +142,46 @@ SD_API sd_image_t *img2img(sd_ctx_t *sd_ctx, typedef struct upscaler_ctx_t upscaler_ctx_t; -SD_API upscaler_ctx_t *new_upscaler_ctx(const char *esrgan_path, +SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, int n_threads, enum sd_type_t wtype); -SD_API void free_upscaler_ctx(upscaler_ctx_t *upscaler_ctx); +SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); -SD_API sd_image_t upscale(upscaler_ctx_t *upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); +SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); -SD_API void init_backend(sd_ctx_t *sd_ctx); +SD_API void init_backend(sd_ctx_t* sd_ctx); -SD_API void set_options(sd_ctx_t *sd_ctx, +SD_API void set_options(sd_ctx_t* sd_ctx, int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir, + const char* lora_model_dir, rng_type_t rng_type, bool vae_tiling, sd_type_t wtype, schedule_t schedule); -SD_API bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "te."); +SD_API bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix = "te."); -SD_API void free_clip_params(sd_ctx_t *sd_ctx); +SD_API void free_clip_params(sd_ctx_t* sd_ctx); -SD_API bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "unet."); +SD_API bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix = "unet."); -SD_API void free_unet_params(sd_ctx_t *sd_ctx); +SD_API void free_unet_params(sd_ctx_t* sd_ctx); -SD_API bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix = "vae."); +SD_API bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix = "vae."); 
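Taken together, the declarations above are already enough to drive generation end to end. A minimal sketch of a C++ caller (error handling trimmed; the model path and prompt are placeholders, and the defaulted trailing argument of new_sd_ctx() is relied on):

    #include "stable-diffusion.h"
    #include <cstdlib>

    int generate_once() {
        sd_ctx_t* ctx = new_sd_ctx(/*n_threads=*/-1, /*vae_decode_only=*/true,
                                   /*free_params_immediately=*/true, /*lora_model_dir=*/"",
                                   CUDA_RNG, /*vae_tiling=*/false, SD_TYPE_F16, DEFAULT);
        if (ctx == NULL)
            return 1;
        if (!load_diffusions_from_file(ctx, "model.safetensors")) {  // placeholder path
            free_sd_ctx(ctx);
            return 1;
        }
        sd_image_t* imgs = txt2img(ctx, "a photo of a cat", /*negative_prompt=*/"",
                                   /*clip_skip=*/-1, /*cfg_scale=*/7.0f, 512, 512,
                                   EULER_A, /*sample_steps=*/20, /*seed=*/42,
                                   /*batch_count=*/1);
        if (imgs != NULL) {
            free(imgs[0].data);  // each sd_image_t carries a heap-allocated RGB buffer
            free(imgs);
        }
        free_sd_ctx(ctx);
        return 0;
    }
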
-SD_API void free_vae_params(sd_ctx_t *sd_ctx); +SD_API void free_vae_params(sd_ctx_t* sd_ctx); -SD_API bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path); +SD_API bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path); -SD_API void free_taesd_params(sd_ctx_t *sd_ctx); +SD_API void free_taesd_params(sd_ctx_t* sd_ctx); -SD_API bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path); +SD_API bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path); -SD_API void free_diffusions_params(sd_ctx_t *sd_ctx); +SD_API void free_diffusions_params(sd_ctx_t* sd_ctx); -SD_API bool convert(const char *input_path, const char *vae_path, const char *output_path, sd_type_t output_type); +SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type); #ifdef __cplusplus } diff --git a/unet.hpp b/unet.hpp index 6b6e7439..19bb5553 100644 --- a/unet.hpp +++ b/unet.hpp @@ -390,7 +390,7 @@ struct SpatialTransformer { #if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false); // [N * n_head, h * w, d_head] #else - struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position] + struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position] // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0); kq = ggml_soft_max_inplace(ctx, kq); @@ -418,15 +418,15 @@ struct SpatialTransformer { { // GEGLU auto x_w = ggml_view_2d(ctx, - transformer.ff_0_proj_w, - transformer.ff_0_proj_w->ne[0], - transformer.ff_0_proj_w->ne[1] / 2, - transformer.ff_0_proj_w->nb[1], - 0); // [in_channels * 4, in_channels] + transformer.ff_0_proj_w, + transformer.ff_0_proj_w->ne[0], + transformer.ff_0_proj_w->ne[1] / 2, + transformer.ff_0_proj_w->nb[1], + 0); // [in_channels * 4, in_channels] auto x_b = ggml_view_1d(ctx, - transformer.ff_0_proj_b, - transformer.ff_0_proj_b->ne[0] / 2, - 0); // [in_channels * 4, in_channels] + transformer.ff_0_proj_b, + transformer.ff_0_proj_b->ne[0] / 2, + 0); // [in_channels * 4, in_channels] auto gate_w = ggml_view_2d(ctx, transformer.ff_0_proj_w, transformer.ff_0_proj_w->ne[0], From 8e973464174f8308ede35602c13f6349d5a6e8b9 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Wed, 24 Jan 2024 18:01:34 +0800 Subject: [PATCH 3/8] cli --- examples/cli/main.cpp | 537 +++++++++++++++++------------- stable-diffusion.cpp | 746 ++++++++++++++++++++++-------------------- 2 files changed, 702 insertions(+), 581 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index b08340b3..31893751 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "stable-diffusion.h" @@ -17,49 +18,53 @@ #include "stb_image_write.h" -const char* rng_type_to_str[] = { - "std_default", - "cuda", +const char *rng_type_to_str[] = { + "std_default", + "cuda", }; // Names of the sampler method, same order as enum sample_method in stable-diffusion.h -const char* sample_method_str[] = { - "euler_a", - "euler", - "heun", - "dpm2", - "dpm++2s_a", - "dpm++2m", - "dpm++2mv2", - "lcm", +const char *sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm2", + "dpm++2s_a", + "dpm++2m", + "dpm++2mv2", + "lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h -const char* schedule_str[] = { - "default", - "discrete", - "karras", +const char *schedule_str[] = { + 
"default", + "discrete", + "karras", }; -const char* modes_str[] = { - "txt2img", - "img2img", - "convert", +const char *modes_str[] = { + "txt2img", + "img2img", + "convert", + "stream" }; enum SDMode { TXT2IMG, IMG2IMG, CONVERT, + STREAM, MODE_COUNT }; struct SDParams { int n_threads = -1; - SDMode mode = TXT2IMG; + SDMode mode = TXT2IMG; std::string model_path; std::string vae_path; + std::string clip_path; + std::string unet_path; std::string taesd_path; std::string esrgan_path; sd_type_t wtype = SD_TYPE_COUNT; @@ -70,22 +75,23 @@ struct SDParams { std::string prompt; std::string negative_prompt; float cfg_scale = 7.0f; - int clip_skip = -1; // <= 0 represents unspecified - int width = 512; - int height = 512; + int clip_skip = -1; // <= 0 represents unspecified + int width = 512; + int height = 512; int batch_count = 1; sample_method_t sample_method = EULER_A; - schedule_t schedule = DEFAULT; - int sample_steps = 20; - float strength = 0.75f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool vae_tiling = false; + schedule_t schedule = DEFAULT; + int sample_steps = 20; + float strength = 0.75f; + rng_type_t rng_type = CUDA_RNG; + int64_t seed = 42; + bool verbose = false; + bool vae_tiling = false; + bool vae_decode_only = false; }; -static std::string sd_basename(const std::string& path) { +static std::string sd_basename(const std::string &path) { size_t pos = path.find_last_of('/'); if (pos != std::string::npos) { return path.substr(pos + 1); @@ -104,6 +110,8 @@ void print_params(SDParams params) { printf(" model_path: %s\n", params.model_path.c_str()); printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified"); printf(" vae_path: %s\n", params.vae_path.c_str()); + printf(" clip_path: %s\n", params.clip_path.c_str()); + printf(" unet_path: %s\n", params.unet_path.c_str()); printf(" taesd_path: %s\n", params.taesd_path.c_str()); printf(" esrgan_path: %s\n", params.esrgan_path.c_str()); printf(" output_path: %s\n", params.output_path.c_str()); @@ -124,16 +132,19 @@ void print_params(SDParams params) { printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false"); } -void print_usage(int argc, const char* argv[]) { +void print_usage(int argc, const char *argv[]) { printf("usage: %s [arguments]\n", argv[0]); printf("\n"); printf("arguments:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n"); + printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert or stream, default: txt2img)\n"); printf(" -t, --threads N number of threads to use during computation (default: -1).\n"); printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n"); printf(" -m, --model [MODEL] path to model\n"); + printf(" If the path is directory, support load model from \"unet/diffusion_pytorch_model.safetensors\", \"vae/diffusion_pytorch_model.safetensors\",\"text_encoder/model.safetensors\"\n"); printf(" --vae [VAE] path to vae\n"); + printf(" --clip [CLIP] path to clip\n"); + printf(" --unet [UNET] path to unet\n"); printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. 
Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n"); printf(" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n"); @@ -148,7 +159,7 @@ void print_usage(int argc, const char* argv[]) { printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); printf(" -W, --width W image width, in pixel space (default: 512)\n"); - printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n"); + printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n"); printf(" sampling method (default: \"euler_a\")\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); @@ -161,7 +172,7 @@ void print_usage(int argc, const char* argv[]) { printf(" -v, --verbose print extra info\n"); } -void parse_args(int argc, const char** argv, SDParams& params) { +void parse_args(int argc, const char **argv, SDParams ¶ms) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -178,19 +189,19 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* mode_selected = argv[i]; - int mode_found = -1; + const char *mode_selected = argv[i]; + int mode_found = -1; for (int d = 0; d < MODE_COUNT; d++) { if (!strcmp(mode_selected, modes_str[d])) { mode_found = d; } } if (mode_found == -1) { - fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img]\n", + fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img, convert, txt2img]\n", mode_selected); exit(1); } - params.mode = (SDMode)mode_found; + params.mode = (SDMode) mode_found; } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_arg = true; @@ -203,6 +214,18 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.vae_path = argv[i]; + } else if (arg == "--clip") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.vae_path = argv[i]; + } else if (arg == "--unet") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.vae_path = argv[i]; } else if (arg == "--taesd") { if (++i >= argc) { invalid_arg = true; @@ -334,8 +357,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* schedule_selected = argv[i]; - int schedule_found = -1; + const char *schedule_selected = argv[i]; + int schedule_found = -1; for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; @@ -345,7 +368,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.schedule = (schedule_t)schedule_found; + params.schedule = (schedule_t) schedule_found; } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_arg = true; @@ -357,8 +380,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - const char* sample_method_selected = argv[i]; - int sample_method_found = -1; + const char *sample_method_selected = argv[i]; + int sample_method_found = -1; for (int m = 0; m < N_SAMPLE_METHODS; m++) { if (!strcmp(sample_method_selected, sample_method_str[m])) { sample_method_found = m; @@ -368,7 +391,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.sample_method = (sample_method_t)sample_method_found; 
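A note on the directory form of --model accepted above: the components are expected under a diffusers-style layout, and a caller resolving the files itself could do so along these lines (the struct and helper below are illustrative assumptions, not part of this patch):

    #include <string>

    struct ModelComponentPaths {
        std::string unet, vae, clip;
    };

    // Resolve per-component files under a diffusers-style model directory.
    ModelComponentPaths resolve_model_dir(const std::string& dir) {
        ModelComponentPaths p;
        p.unet = dir + "/unet/diffusion_pytorch_model.safetensors";
        p.vae  = dir + "/vae/diffusion_pytorch_model.safetensors";
        p.clip = dir + "/text_encoder/model.safetensors";
        return p;
    }
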
+ params.sample_method = (sample_method_t) sample_method_found; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv); exit(0); @@ -385,62 +408,65 @@ void parse_args(int argc, const char** argv, SDParams& params) { print_usage(argc, argv); exit(1); } + if (params.n_threads <= 0) { params.n_threads = get_num_physical_cores(); } - if (params.mode != CONVERT && params.prompt.length() == 0) { - fprintf(stderr, "error: the following arguments are required: prompt\n"); - print_usage(argc, argv); - exit(1); - } + if (params.mode != STREAM) { + if (params.mode != CONVERT && params.prompt.length() == 0) { + fprintf(stderr, "error: the following arguments are required: prompt\n"); + print_usage(argc, argv); + exit(1); + } - if (params.model_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: model_path\n"); - print_usage(argc, argv); - exit(1); - } + if (params.model_path.length() == 0) { + fprintf(stderr, "error: the following arguments are required: model_path\n"); + print_usage(argc, argv); + exit(1); + } - if (params.mode == IMG2IMG && params.input_path.length() == 0) { - fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n"); - print_usage(argc, argv); - exit(1); - } + if (params.mode == IMG2IMG && params.input_path.length() == 0) { + fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n"); + print_usage(argc, argv); + exit(1); + } - if (params.output_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: output_path\n"); - print_usage(argc, argv); - exit(1); - } + if (params.output_path.length() == 0) { + fprintf(stderr, "error: the following arguments are required: output_path\n"); + print_usage(argc, argv); + exit(1); + } - if (params.width <= 0 || params.width % 64 != 0) { - fprintf(stderr, "error: the width must be a multiple of 64\n"); - exit(1); - } + if (params.width <= 0 || params.width % 64 != 0) { + fprintf(stderr, "error: the width must be a multiple of 64\n"); + exit(1); + } - if (params.height <= 0 || params.height % 64 != 0) { - fprintf(stderr, "error: the height must be a multiple of 64\n"); - exit(1); - } + if (params.height <= 0 || params.height % 64 != 0) { + fprintf(stderr, "error: the height must be a multiple of 64\n"); + exit(1); + } - if (params.sample_steps <= 0) { - fprintf(stderr, "error: the sample_steps must be greater than 0\n"); - exit(1); - } + if (params.sample_steps <= 0) { + fprintf(stderr, "error: the sample_steps must be greater than 0\n"); + exit(1); + } - if (params.strength < 0.f || params.strength > 1.f) { - fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); - exit(1); - } + if (params.strength < 0.f || params.strength > 1.f) { + fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); + exit(1); + } - if (params.seed < 0) { - srand((int)time(NULL)); - params.seed = rand(); - } + if (params.seed < 0) { + srand((int) time(NULL)); + params.seed = rand(); + } - if (params.mode == CONVERT) { - if (params.output_path == "output.png") { - params.output_path = "output.gguf"; + if (params.mode == CONVERT) { + if (params.output_path == "output.png") { + params.output_path = "output.gguf"; + } } } } @@ -465,8 +491,8 @@ std::string get_image_params(SDParams params, int64_t seed) { return parameter_string; } -void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { - SDParams* params = (SDParams*)data; +void sd_log_cb(enum sd_log_level_t 
level, const char *log, void *data) {
-    SDParams* params = (SDParams*)data;
+    SDParams *params = (SDParams *) data;
     if (!params->verbose && level <= SD_LOG_DEBUG) {
         return;
     }
@@ -479,182 +505,243 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
     }
 }

-int main(int argc, const char* argv[]) {
-    SDParams params;
-    parse_args(argc, argv, params);
+std::vector<std::string> parse_cin(std::string &input, std::vector<std::string> ignore_args) {
+    std::vector<std::string> inputTokens;
+    std::istringstream iss(input);

-    sd_set_log_callback(sd_log_cb, (void*)&params);
-
-    if (params.verbose) {
-        print_params(params);
-        printf("%s", sd_get_system_info());
+    std::string word;
+    while (iss >> word) {
+        inputTokens.push_back(word);
     }

-    if (params.mode == CONVERT) {
-        bool success = convert(params.model_path.c_str(),
-                               params.vae_path.c_str(),
-                               params.output_path.c_str(),
-                               params.wtype);
-        if (!success) {
-            fprintf(stderr,
-                    "convert '%s'/'%s' to '%s' failed\n",
-                    params.model_path.c_str(),
-                    params.vae_path.c_str(),
-                    params.output_path.c_str());
-            return 1;
-        } else {
-            printf("convert '%s'/'%s' to '%s' success\n",
-                   params.model_path.c_str(),
-                   params.vae_path.c_str(),
-                   params.output_path.c_str());
-            return 0;
+    std::vector<std::string> commands;
+    for (size_t i = 0; i < inputTokens.size(); i++) {
+        if (std::find(ignore_args.begin(), ignore_args.end(), inputTokens[i]) != ignore_args.end()) {
+            i++; // skip the ignored token together with its value
+            continue;
         }
+        commands.push_back(inputTokens[i]);
     }
+    return commands;
+}

-    bool vae_decode_only        = true;
-    uint8_t* input_image_buffer = NULL;
-    if (params.mode == IMG2IMG) {
-        vae_decode_only = false;
+class CliInstance {
+public:
+    sd_ctx_t *sd_ctx;

-        int c              = 0;
+    ~CliInstance() {
+        free_sd_ctx(sd_ctx);
+    }
+
+    CliInstance(const SDParams &params) {
+        sd_ctx = new_sd_ctx(
+                params.n_threads,
+                params.vae_decode_only,
+                true,
+                params.lora_model_dir.c_str(),
+                params.rng_type,
+                params.vae_tiling,
+                params.wtype,
+                params.schedule,
+                true);
+    }
+
+    // TODO: load models dynamically
+
+    void txtimg(SDParams &params) {
+        set_options(sd_ctx, params.n_threads,
+                    params.vae_decode_only,
+                    true,
+                    params.lora_model_dir.c_str(),
+                    params.rng_type,
+                    params.vae_tiling,
+                    params.wtype,
+                    params.schedule);
+        sd_image_t *results = txt2img(sd_ctx,
+                                      params.prompt.c_str(),
+                                      params.negative_prompt.c_str(),
+                                      params.clip_skip,
+                                      params.cfg_scale,
+                                      params.width,
+                                      params.height,
+                                      params.sample_method,
+                                      params.sample_steps,
+                                      params.seed,
+                                      params.batch_count);
+        results = upscaler(params, results);
+        save_image(params, results);
+    }
+
+    void imgimg(SDParams &params) {
+        set_options(sd_ctx, params.n_threads,
+                    params.vae_decode_only,
+                    true,
+                    params.lora_model_dir.c_str(),
+                    params.rng_type,
+                    params.vae_tiling,
+                    params.wtype,
+                    params.schedule);
+        uint8_t *input_image_buffer = NULL;
+
+        int c = 0;
         input_image_buffer = stbi_load(params.input_path.c_str(), &params.width, &params.height, &c, 3);
         if (input_image_buffer == NULL) {
             fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str());
-            return 1;
+            return;
         }
         if (c != 3) {
             fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c);
             free(input_image_buffer);
-            return 1;
+            return;
         }
         if (params.width <= 0 || params.width % 64 != 0) {
             fprintf(stderr, "error: the width of image must be a multiple of 64\n");
             free(input_image_buffer);
-            return 1;
+            return;
         }
+
         if (params.height <= 0 || params.height % 64 != 0) {
             fprintf(stderr, "error: the height of image must be a multiple of 64\n");
             free(input_image_buffer);
-            return 1;
+            return;
         }
-    }

-    sd_ctx_t* sd_ctx = new_sd_ctx(
-        params.n_threads,
vae_decode_only, - true, - params.lora_model_dir.c_str(), - params.rng_type, - params.vae_tiling, - params.wtype, - params.schedule, - true); - - if (sd_ctx == NULL) { - printf("new_sd_ctx_t failed\n"); - return 1; - } + sd_image_t input_image = {(uint32_t) params.width, + (uint32_t) params.height, + 3, + input_image_buffer}; - if (!load_diffusions_from_file(sd_ctx, params.model_path.c_str())) { - printf("load diffusions model failed\n"); - return 1; + sd_image_t *results = img2img(sd_ctx, + input_image, + params.prompt.c_str(), + params.negative_prompt.c_str(), + params.clip_skip, + params.cfg_scale, + params.width, + params.height, + params.sample_method, + params.sample_steps, + params.strength, + params.seed, + params.batch_count); + results = upscaler(params, results); + save_image(params, results); } - if (!params.taesd_path.empty()) { - free_unet_params(sd_ctx); - if (!load_taesd_from_file(sd_ctx, params.taesd_path.c_str())) { - printf("load taesd model failed\n"); - return 1; +protected: + + void save_image(const SDParams ¶ms, sd_image_t *results) { + size_t last = params.output_path.find_last_of("."); + std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path; + for (int i = 0; i < params.batch_count; i++) { + if (results[i].data == NULL) { + continue; + } + std::string final_image_path = + i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; + stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, + results[i].data, 0, get_image_params(params, params.seed + i).c_str()); + printf("save result image to '%s'\n", final_image_path.c_str()); + free(results[i].data); + results[i].data = NULL; } + free(results); } - if (!params.vae_path.empty()) { - free_vae_params(sd_ctx); - if (!load_vae_from_file(sd_ctx, params.vae_path.c_str())) { - printf("load vae model failed\n"); - return 1; + sd_image_t *upscaler(const SDParams ¶ms, sd_image_t *results) { + int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth + if (params.esrgan_path.size() > 0) { + upscaler_ctx_t *upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + params.n_threads, + params.wtype); + + if (upscaler_ctx == NULL) { + printf("new_upscaler_ctx failed\n"); + } else { + for (int i = 0; i < params.batch_count; i++) { + if (results[i].data == NULL) { + continue; + } + sd_image_t upscaled_image = upscale(upscaler_ctx, results[i], upscale_factor); + if (upscaled_image.data == NULL) { + printf("upscale failed\n"); + continue; + } + free(results[i].data); + results[i] = upscaled_image; + } + free_upscaler_ctx(upscaler_ctx); + } } + return results; } +}; - sd_image_t* results; - if (params.mode == TXT2IMG) { - results = txt2img(sd_ctx, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - params.cfg_scale, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.seed, - params.batch_count); - } else { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, - 3, - input_image_buffer}; +int main(int argc, const char *argv[]) { + SDParams params; + parse_args(argc, argv, params); - results = img2img(sd_ctx, - input_image, - params.prompt.c_str(), - params.negative_prompt.c_str(), - params.clip_skip, - params.cfg_scale, - params.width, - params.height, - params.sample_method, - params.sample_steps, - params.strength, - params.seed, - params.batch_count); - } + sd_set_log_callback(sd_log_cb, (void *) ¶ms); - if 
(results == NULL) {
-        printf("generate failed\n");
-        free_sd_ctx(sd_ctx);
-        return 1;
+    if (params.verbose) {
+        print_params(params);
+        printf("%s", sd_get_system_info());
     }

-    int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
-    if (params.esrgan_path.size() > 0) {
-        upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
-                                                        params.n_threads,
-                                                        params.wtype);
-
-        if (upscaler_ctx == NULL) {
-            printf("new_upscaler_ctx failed\n");
+    if (params.mode == CONVERT) {
+        bool success = convert(params.model_path.c_str(),
+                               params.vae_path.c_str(),
+                               params.output_path.c_str(),
+                               params.wtype);
+        if (!success) {
+            fprintf(stderr,
+                    "convert '%s'/'%s' to '%s' failed\n",
+                    params.model_path.c_str(),
+                    params.vae_path.c_str(),
+                    params.output_path.c_str());
+            return 1;
         } else {
-            for (int i = 0; i < params.batch_count; i++) {
-                if (results[i].data == NULL) {
-                    continue;
-                }
-                sd_image_t upscaled_image = upscale(upscaler_ctx, results[i], upscale_factor);
-                if (upscaled_image.data == NULL) {
-                    printf("upscale failed\n");
-                    continue;
-                }
-                free(results[i].data);
-                results[i] = upscaled_image;
-            }
+            printf("convert '%s'/'%s' to '%s' success\n",
+                   params.model_path.c_str(),
+                   params.vae_path.c_str(),
+                   params.output_path.c_str());
+            return 0;
         }
     }

-    size_t last            = params.output_path.find_last_of(".");
-    std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
-    for (int i = 0; i < params.batch_count; i++) {
-        if (results[i].data == NULL) {
-            continue;
+    auto instance = new CliInstance(params);
+    if (params.mode == STREAM) {
+        while (true) {
+            std::cout << "You are in stream mode; feel free to use txt2img or img2img" << std::endl;
+            std::string input;
+            std::getline(std::cin, input);
+            std::vector<std::string> ignore_cmd = {""};
+            auto args = parse_cin(input, ignore_cmd);
+            SDParams stream_params;
+            // parse_args() starts reading at argv[1], so prepend a dummy program name
+            const char **args_c_arr = new const char *[args.size() + 1];
+            args_c_arr[0] = "sd";
+            for (size_t i = 0; i < args.size(); ++i) {
+                args_c_arr[i + 1] = args[i].c_str();
+            }
+            parse_args((int) args.size() + 1, args_c_arr, stream_params);
+            delete[] args_c_arr;
+            if (stream_params.mode == TXT2IMG) {
+                instance->txtimg(stream_params);
+            } else if (stream_params.mode == IMG2IMG) {
+                instance->imgimg(stream_params);
+            } else {
+                exit(1);
+            }
+        }
+    } else {
+        if (params.mode == TXT2IMG) {
+            instance->txtimg(params);
+        } else if (params.mode == IMG2IMG) {
+            instance->imgimg(params);
+        } else {
+            exit(1);
         }
     }
-        std::string final_image_path = i > 0 ?
dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; - stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, - results[i].data, 0, get_image_params(params, params.seed + i).c_str()); - printf("save result image to '%s'\n", final_image_path.c_str()); - free(results[i].data); - results[i].data = NULL; } - free(results); - free_sd_ctx(sd_ctx); - return 0; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 28f5d8c8..e3090803 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -14,35 +14,35 @@ #include "unet.hpp" #include "vae.hpp" -const char* model_version_to_str[] = { - "1.x", - "2.x", - "XL", +const char *model_version_to_str[] = { + "1.x", + "2.x", + "XL", }; -const char* sampling_methods_str[] = { - "Euler A", - "Euler", - "Heun", - "DPM2", - "DPM++ (2s)", - "DPM++ (2M)", - "modified DPM++ (2M)", - "LCM", +const char *sampling_methods_str[] = { + "Euler A", + "Euler", + "Heun", + "DPM2", + "DPM++ (2s)", + "DPM++ (2M)", + "modified DPM++ (2M)", + "LCM", }; /*================================================== Helper Functions ================================================*/ -void calculate_alphas_cumprod(float* alphas_cumprod, +void calculate_alphas_cumprod(float *alphas_cumprod, float linear_start = 0.00085f, - float linear_end = 0.0120, - int timesteps = TIMESTEPS) { + float linear_end = 0.0120, + int timesteps = TIMESTEPS) { float ls_sqrt = sqrtf(linear_start); float le_sqrt = sqrtf(linear_end); - float amount = le_sqrt - ls_sqrt; + float amount = le_sqrt - ls_sqrt; float product = 1.0f; for (int i = 0; i < timesteps; i++) { - float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); + float beta = ls_sqrt + amount * ((float) i / (timesteps - 1)); product *= 1.0f - powf(beta, 2.0f); alphas_cumprod[i] = product; } @@ -53,20 +53,20 @@ void calculate_alphas_cumprod(float* alphas_cumprod, class StableDiffusionGGML { public: SDVersion version; - bool vae_decode_only = false; + bool vae_decode_only = false; bool free_params_immediately = false; std::shared_ptr rng = std::make_shared(); - int n_threads = -1; - float scale_factor = 0.18215f; + int n_threads = -1; + float scale_factor = 0.18215f; FrozenCLIPEmbedderWithCustomWords cond_stage_model; UNetModel diffusion_model; AutoEncoderKL first_stage_model; bool use_tiny_autoencoder = false; - bool vae_tiling = false; + bool vae_tiling = false; - std::map tensors; + std::map tensors; std::string lora_model_dir; // lora_name => multiplier @@ -74,13 +74,17 @@ class StableDiffusionGGML { std::map loras; std::shared_ptr denoiser = std::make_shared(); - schedule_t schedule = DEFAULT; + schedule_t schedule = DEFAULT; - ggml_backend_t backend = NULL; // general backend + ggml_backend_t backend = NULL; // general backend ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type - ggml_type wtype = GGML_TYPE_COUNT; // options weight type + ggml_type wtype = GGML_TYPE_COUNT; // options weight type TinyAutoEncoder tae_first_stage; + + std::string clip_path; + std::string vae_path; + std::string unet_path; std::string taesd_path; ModelLoader model_loader; @@ -96,15 +100,15 @@ class StableDiffusionGGML { ggml_type wtype, schedule_t schedule, bool init_backend_immediately = true) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - free_params_immediately(free_params_immediately), - lora_model_dir(lora_model_dir), - vae_tiling(vae_tiling), - wtype(wtype), - schedule(schedule) { + : n_threads(n_threads), + vae_decode_only(vae_decode_only), + 
free_params_immediately(free_params_immediately),
+            lora_model_dir(lora_model_dir),
+            vae_tiling(vae_tiling),
+            wtype(wtype),
+            schedule(schedule) {
         first_stage_model.decode_only = vae_decode_only;
-        tae_first_stage.decode_only   = vae_decode_only;
+        tae_first_stage.decode_only = vae_decode_only;
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
@@ -151,22 +155,41 @@ class StableDiffusionGGML {
                      bool vae_tiling,
                      sd_type_t wtype,
                      schedule_t schedule) {
-        this->n_threads       = n_threads;
-        this->vae_decode_only = vae_decode_only;
+        this->n_threads = n_threads;
+        bool standalone = vae_path != clip_path && vae_path != unet_path;
+        if (this->vae_decode_only != vae_decode_only) {
+            this->vae_decode_only = vae_decode_only;
+            if (!vae_path.empty() && first_stage_model.params_buffer_size > 0) {
+                free_vae_params();
+                std::string prefix;
+                if (standalone) {
+                    prefix = "vae.";
+                }
+                load_vae_from_file(vae_path, standalone, prefix);
+            }
+        }
+
         this->free_params_immediately = free_params_immediately;
-        this->lora_model_dir          = lora_model_dir;
+        this->lora_model_dir = std::move(lora_model_dir);
         if (rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
         } else if (rng_type == CUDA_RNG) {
             rng = std::make_shared<PhiloxRNG>();
         }
         this->vae_tiling = vae_tiling;
-        this->wtype      = (ggml_type)wtype;
-        this->schedule   = schedule;
-        apply_schedule();
+
+        if (this->wtype != (ggml_type) wtype) {
+            this->wtype = (ggml_type) wtype;
+            // TODO: changing wtype requires reloading the model weights
+        }
+
+        if (this->schedule != schedule) {
+            this->schedule = schedule;
+            apply_schedule();
+        }
     }

-    bool load_clip_from_file(const std::string& model_path, bool standalone = true, const std::string& prefix = "te.") {
+    bool load_clip_from_file(const std::string &model_path, bool standalone = true, const std::string &prefix = "te.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -232,11 +255,11 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(3 * 1024) * 1024;  // 10M
+        params.mem_size = static_cast<size_t>(3 * 1024) * 1024; // 3M
         params.mem_buffer = NULL;
-        params.no_alloc   = false;
+        params.no_alloc = false;
         // LOG_DEBUG("mem_size %u ", params.mem_size);
-        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check
         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
             return false;
@@ -246,10 +269,10 @@ class StableDiffusionGGML {
         LOG_DEBUG("loading clip weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor *> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        for (auto& pair : tensors) {
+        for (auto &pair: tensors) {
             tensors_need_to_load.insert(pair);
         }
@@ -264,6 +287,7 @@ class StableDiffusionGGML {
         int64_t t1 = ggml_time_ms();
         LOG_INFO("loading clip model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000);
         ggml_free(ctx);
+        clip_path = model_path;
         return true;
     }
@@ -273,9 +297,9 @@ class StableDiffusionGGML {
         }
     }

-    bool load_unet_from_file(const std::string& model_path,
-                             bool standalone           = true,
-                             const std::string& prefix = "unet.") {
+    bool load_unet_from_file(const std::string &model_path,
+                             bool standalone = true,
+                             const std::string &prefix = "unet.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -308,11 +332,11 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(3 * 1024) * 1024;  // 10M
+        params.mem_size = static_cast<size_t>(3 * 1024) * 1024; // 3M
         params.mem_buffer = NULL;
-        params.no_alloc   = false;
+        params.no_alloc = false;

-        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check

         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
@@ -323,13 +347,13 @@ class StableDiffusionGGML {
         LOG_DEBUG("loading weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor *> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
-        calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data);
+        ggml_tensor *alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
+        calculate_alphas_cumprod((float *) alphas_cumprod_tensor->data);
         tensors_need_to_load["alphas_cumprod"] = alphas_cumprod_tensor;
-        for (auto& pair : tensors) {
-            const std::string& name = pair.first;
+        for (auto &pair: tensors) {
+            const std::string &name = pair.first;
             if (starts_with(name, "cond_stage_model.") || starts_with(name, "first_stage_model.")) {
                 ignore_tensors.insert(name);
                 continue;
@@ -362,6 +386,7 @@ class StableDiffusionGGML {
         apply_schedule();

         ggml_free(ctx);
+        unet_path = model_path;
         return true;
     }
@@ -371,9 +396,9 @@ class StableDiffusionGGML {
         }
     }

-    bool load_vae_from_file(const std::string& model_path,
-                            bool standalone           = true,
-                            const std::string& prefix = "vae.") {
+    bool load_vae_from_file(const std::string &model_path,
+                            bool standalone = true,
+                            const std::string &prefix = "vae.") {
         if (backend == NULL) {
             LOG_ERROR("if you set init_backend_immediately false, please call init_backend first");
             return false;
@@ -410,11 +435,11 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(10 * 1024) * 1024;  // 10M
+        params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10M
         params.mem_buffer = NULL;
-        params.no_alloc   = false;
+        params.no_alloc = false;
         // LOG_DEBUG("mem_size %u ", params.mem_size);
-        struct ggml_context* ctx = ggml_init(params);  // for alphas_cumprod and is_using_v_parameterization check
+        struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check
         if (!ctx) {
             LOG_ERROR("ggml_init() failed");
             return false;
@@ -424,17 +449,15 @@ class StableDiffusionGGML {
         LOG_DEBUG("loading weights");
         int64_t t0 = ggml_time_ms();

-        std::map<std::string, ggml_tensor*> tensors_need_to_load;
+        std::map<std::string, ggml_tensor *> tensors_need_to_load;
         std::set<std::string> ignore_tensors;
-        for (auto& pair : tensors) {
-            const std::string& name = pair.first;
-            // TODO: make it can reload in compute time. so we can set vae_decode_only dynamic.
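        // Why the filtering below matters: with vae_decode_only set, the VAE encoder and
        // quantization tensors (first_stage_model.encoder.*, first_stage_model.quant*) are
        // never loaded, since txt2img only runs the decoder. img2img needs the encoder
        // again, which is why set_options() above reloads the VAE when the flag changes.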
+ for (auto &pair: tensors) { + const std::string &name = pair.first; if (vae_decode_only && (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) { ignore_tensors.insert(name); continue; } - tensors_need_to_load.insert(pair); } bool success = model_loader.load_tensors(tensors_need_to_load, backend, ignore_tensors, standalone); @@ -447,6 +470,7 @@ class StableDiffusionGGML { int64_t t1 = ggml_time_ms(); LOG_INFO("loading vae model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000); ggml_free(ctx); + vae_path = model_path; return true; } @@ -457,7 +481,7 @@ class StableDiffusionGGML { } // load the all model from one file - bool load_diffusions_from_file(const std::string& model_path) { + bool load_diffusions_from_file(const std::string &model_path) { LOG_INFO("loading model from '%s'", model_path.c_str()); if (!load_clip_from_file(model_path, false, "")) { free_clip_params(); @@ -491,13 +515,15 @@ class StableDiffusionGGML { LOG_INFO("free vae params"); } - bool load_taesd_from_file(const std::string& taesd_path) { + bool load_taesd_from_file(const std::string &taesd_path) { if (first_stage_model.params_buffer_size > 0) { free_vae_params(); } if (taesd_path.empty() || !tae_first_stage.load_from_file(taesd_path, backend)) { return false; } + + this->taesd_path = taesd_path; use_tiny_autoencoder = true; return true; } @@ -508,34 +534,34 @@ class StableDiffusionGGML { } } - bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) { - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); + bool is_using_v_parameterization_for_sd2(ggml_context *work_ctx) { + struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); ggml_set_f32(x_t, 0.5); - struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); + struct ggml_tensor *c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); ggml_set_f32(c, 0.5); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); // [N, ] - struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, - diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] int64_t t0 = ggml_time_ms(); ggml_set_f32(timesteps, 999); set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels); - struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *out = ggml_dup_tensor(work_ctx, x_t); diffusion_model.alloc_compute_buffer(x_t, c, t_emb); diffusion_model.compute(out, n_threads, x_t, NULL, c, t_emb); diffusion_model.free_compute_buffer(); double result = 0.f; { - float* vec_x = (float*)x_t->data; - float* vec_out = (float*)out->data; + float *vec_x = (float *) x_t->data; + float *vec_out = (float *) out->data; int64_t n = ggml_nelements(out); for (int i = 0; i < n; i++) { - result += ((double)vec_out[i] - (double)vec_x[i]); + result += ((double) vec_out[i] - (double) vec_x[i]); } result /= n; } @@ -568,15 +594,15 @@ class StableDiffusionGGML { for (int i = 0; i < TIMESTEPS; i++) { denoiser->schedule->alphas_cumprod[i] = alphas_cumprod_tensor[i]; - denoiser->schedule->sigmas[i] = std::sqrt( - (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); + denoiser->schedule->sigmas[i] = std::sqrt( + (1 - 
denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); } } - void apply_lora(const std::string& lora_name, float multiplier) { - int64_t t0 = ggml_time_ms(); - std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); + void apply_lora(const std::string &lora_name, float multiplier) { + int64_t t0 = ggml_time_ms(); + std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); std::string file_path; if (file_exists(st_file_path)) { @@ -606,14 +632,14 @@ class StableDiffusionGGML { (t1 - t0) * 1.0f / 1000); } - void apply_loras(const std::unordered_map& lora_state) { + void apply_loras(const std::unordered_map &lora_state) { if (lora_state.size() > 0 && model_data_type != GGML_TYPE_F16 && model_data_type != GGML_TYPE_F32) { LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); } std::unordered_map lora_state_diff; - for (auto& kv : lora_state) { - const std::string& lora_name = kv.first; - float multiplier = kv.second; + for (auto &kv: lora_state) { + const std::string &lora_name = kv.first; + float multiplier = kv.second; if (curr_lora_state.find(lora_name) != curr_lora_state.end()) { float curr_multiplier = curr_lora_state[lora_name]; @@ -626,35 +652,35 @@ class StableDiffusionGGML { } } - for (auto& kv : lora_state_diff) { + for (auto &kv: lora_state_diff) { apply_lora(kv.first, kv.second); } curr_lora_state = lora_state; } - std::pair get_learned_condition(ggml_context* work_ctx, - const std::string& text, - int clip_skip, - int width, - int height, - bool force_zero_embeddings = false) { + std::pair get_learned_condition(ggml_context *work_ctx, + const std::string &text, + int clip_skip, + int width, + int height, + bool force_zero_embeddings = false) { cond_stage_model.set_clip_skip(clip_skip); - auto tokens_and_weights = cond_stage_model.tokenize(text, true); - std::vector& tokens = tokens_and_weights.first; - std::vector& weights = tokens_and_weights.second; - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* pooled = NULL; - size_t total_hidden_size = cond_stage_model.text_model.hidden_size; + auto tokens_and_weights = cond_stage_model.tokenize(text, true); + std::vector &tokens = tokens_and_weights.first; + std::vector &weights = tokens_and_weights.second; + int64_t t0 = ggml_time_ms(); + struct ggml_tensor *pooled = NULL; + size_t total_hidden_size = cond_stage_model.text_model.hidden_size; if (version == VERSION_XL) { total_hidden_size += cond_stage_model.text_model2.hidden_size; pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, cond_stage_model.text_model2.projection_dim); } - struct ggml_tensor* hidden_states = ggml_new_tensor_2d(work_ctx, + struct ggml_tensor *hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, total_hidden_size, cond_stage_model.text_model.max_position_embeddings); // [N, n_token, hidden_size] - cond_stage_model.alloc_compute_buffer(work_ctx, (int)tokens.size()); + cond_stage_model.alloc_compute_buffer(work_ctx, (int) tokens.size()); cond_stage_model.compute(n_threads, tokens, hidden_states, pooled); cond_stage_model.free_compute_buffer(); // if (pooled != NULL) { @@ -664,7 +690,7 @@ class StableDiffusionGGML { int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states); + 
ggml_tensor *result = ggml_dup_tensor(work_ctx, hidden_states); { float original_mean = ggml_tensor_mean(hidden_states); for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) { @@ -680,34 +706,34 @@ class StableDiffusionGGML { ggml_tensor_scale(result, (original_mean / new_mean)); } if (force_zero_embeddings) { - float* vec = (float*)result->data; + float *vec = (float *) result->data; for (int i = 0; i < ggml_nelements(result); i++) { vec[i] = 0; } } - ggml_tensor* vec = NULL; + ggml_tensor *vec = NULL; if (version == VERSION_XL) { int out_dim = 256; - vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); + vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels); // [0:1280] size_t offset = 0; memcpy(vec->data, pooled->data, ggml_nbytes(pooled)); offset += ggml_nbytes(pooled); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2); // original_size_as_tuple - float orig_width = (float)width; - float orig_height = (float)height; + float orig_width = (float) width; + float orig_height = (float) height; ggml_tensor_set_f32(timesteps, orig_height, 0); ggml_tensor_set_f32(timesteps, orig_width, 1); - ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, + ggml_tensor *embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); offset += ggml_nbytes(embed_view); set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // crop_coords_top_left - float crop_coord_top = 0.f; + float crop_coord_top = 0.f; float crop_coord_left = 0.f; ggml_tensor_set_f32(timesteps, crop_coord_top, 0); ggml_tensor_set_f32(timesteps, crop_coord_left, 1); @@ -716,8 +742,8 @@ class StableDiffusionGGML { set_timestep_embedding(timesteps, embed_view, out_dim); // print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2)); // target_size_as_tuple - float target_width = (float)width; - float target_height = (float)height; + float target_width = (float) width; + float target_height = (float) height; ggml_tensor_set_f32(timesteps, target_height, 0); ggml_tensor_set_f32(timesteps, target_width, 1); embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset); @@ -730,27 +756,27 @@ class StableDiffusionGGML { return {result, vec}; } - ggml_tensor* sample(ggml_context* work_ctx, - ggml_tensor* x_t, - ggml_tensor* noise, - ggml_tensor* c, - ggml_tensor* c_vector, - ggml_tensor* uc, - ggml_tensor* uc_vector, + ggml_tensor *sample(ggml_context *work_ctx, + ggml_tensor *x_t, + ggml_tensor *noise, + ggml_tensor *c, + ggml_tensor *c_vector, + ggml_tensor *uc, + ggml_tensor *uc_vector, float cfg_scale, sample_method_t method, - const std::vector& sigmas) { + const std::vector &sigmas) { size_t steps = sigmas.size() - 1; // x_t = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(x_t); - struct ggml_tensor* x = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *x = ggml_dup_tensor(work_ctx, x_t); copy_ggml_tensor(x, x_t); - struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, - 1); // [N, ] - struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, - diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor 
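        // The SDXL condition vector assembled above concatenates the pooled text
        // embedding (text_model2.projection_dim floats) with three 2x256 timestep-style
        // embeddings for (orig_height, orig_width), (crop_top, crop_left) and
        // (target_height, target_width); with SDXL's 1280-dim pooled output that is
        // 1280 + 3 * 512 = 2816 floats, matching diffusion_model.adm_in_channels.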
*noised_input = ggml_dup_tensor(work_ctx, x_t); + struct ggml_tensor *timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, + 1); // [N, ] + struct ggml_tensor *t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, + diffusion_model.model_channels); // [N, model_channels] diffusion_model.alloc_compute_buffer(noised_input, c, t_emb, c_vector); bool has_unconditioned = cfg_scale != 1.0 && uc != NULL; @@ -765,31 +791,31 @@ class StableDiffusionGGML { } // denoise wrapper - struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* out_uncond = NULL; + struct ggml_tensor *out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *out_uncond = NULL; if (has_unconditioned) { out_uncond = ggml_dup_tensor(work_ctx, x); } - struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *denoised = ggml_dup_tensor(work_ctx, x); - auto denoise = [&](ggml_tensor* input, float sigma, int step) { + auto denoise = [&](ggml_tensor *input, float sigma, int step) { if (step == 1) { - pretty_progress(0, (int)steps, 0); + pretty_progress(0, (int) steps, 0); } int64_t t0 = ggml_time_us(); - float c_skip = 1.0f; - float c_out = 1.0f; - float c_in = 1.0f; + float c_skip = 1.0f; + float c_out = 1.0f; + float c_in = 1.0f; std::vector scaling = denoiser->get_scalings(sigma); if (scaling.size() == 3) { // CompVisVDenoiser c_skip = scaling[0]; - c_out = scaling[1]; - c_in = scaling[2]; + c_out = scaling[1]; + c_in = scaling[2]; } else { // CompVisDenoiser c_out = scaling[0]; - c_in = scaling[1]; + c_in = scaling[1]; } float t = denoiser->schedule->sigma_to_t(sigma); @@ -803,16 +829,16 @@ class StableDiffusionGGML { // cond diffusion_model.compute(out_cond, n_threads, noised_input, NULL, c, t_emb, c_vector); - float* negative_data = NULL; + float *negative_data = NULL; if (has_unconditioned) { // uncond diffusion_model.compute(out_uncond, n_threads, noised_input, NULL, uc, t_emb, uc_vector); - negative_data = (float*)out_uncond->data; + negative_data = (float *) out_uncond->data; } - float* vec_denoised = (float*)denoised->data; - float* vec_input = (float*)input->data; - float* positive_data = (float*)out_cond->data; - int ne_elements = (int)ggml_nelements(denoised); + float *vec_denoised = (float *) denoised->data; + float *vec_input = (float *) input->data; + float *positive_data = (float *) out_cond->data; + int ne_elements = (int) ggml_nelements(denoised); for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -825,7 +851,7 @@ class StableDiffusionGGML { } int64_t t1 = ggml_time_us(); if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + pretty_progress(step, (int) steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } }; @@ -833,8 +859,8 @@ class StableDiffusionGGML { // sample_euler_ancestral switch (method) { case EULER_A: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -844,9 +870,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) 
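            // The denoise lambda above combines the two U-Net passes with the standard
            // classifier-free guidance mix,
            //     latent_result = uncond + cfg_scale * (cond - uncond),
            // then rescales it as denoised = latent_result * c_out + input * c_skip.
            // With cfg_scale = 1 this reduces to the conditional prediction alone, which
            // is why uc/uc_vector are only computed when cfg_scale != 1.0.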
denoised->data; for (int i = 0; i < ggml_nelements(d); i++) { vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; @@ -854,18 +880,18 @@ class StableDiffusionGGML { } // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * - (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / - (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); // Euler method float dt = sigma_down - sigmas[i]; // x = x + d * dt { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_d[i] * dt; @@ -877,8 +903,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -886,10 +912,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case EULER: // Implemented without any sigma churn { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -899,9 +926,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma; @@ -911,18 +938,19 @@ class StableDiffusionGGML { float dt = sigmas[i + 1] - sigma; // x = x + d * dt { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } } - } break; + } + break; case HEUN: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -930,9 +958,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -943,25 +971,25 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; } } else { // Heun step - float* vec_d = (float*)d->data; - float* vec_d2 = (float*)d->data; - float* vec_x = 
(float*)x->data; - float* vec_x2 = (float*)x2->data; + float *vec_d = (float *) d->data; + float *vec_d2 = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt; } denoise(x2, sigmas[i + 1], i + 1); - float* vec_denoised = (float*)denoised->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; vec_d[j] = (vec_d[j] + d2) / 2; @@ -969,10 +997,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case DPM2: { - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise @@ -980,9 +1009,9 @@ class StableDiffusionGGML { // d = (x - denoised) / sigma { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -992,9 +1021,9 @@ class StableDiffusionGGML { if (sigmas[i + 1] == 0) { // Euler step // x = x + d * dt - float dt = sigmas[i + 1] - sigmas[i]; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; + float dt = sigmas[i + 1] - sigmas[i]; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + vec_d[j] * dt; @@ -1002,18 +1031,18 @@ class StableDiffusionGGML { } else { // DPM-Solver-2 float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1]))); - float dt_1 = sigma_mid - sigmas[i]; - float dt_2 = sigmas[i + 1] - sigmas[i]; + float dt_1 = sigma_mid - sigmas[i]; + float dt_2 = sigmas[i + 1] - sigmas[i]; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x2[j] = vec_x[j] + vec_d[j] * dt_1; } denoise(x2, sigma_mid, i + 1); - float* vec_denoised = (float*)denoised->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; vec_x[j] = vec_x[j] + d2 * dt_2; @@ -1021,30 +1050,31 @@ class StableDiffusionGGML { } } - } break; + } + break; case DPMPP2S_A: { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *x2 = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { // denoise denoise(x, sigmas[i], i + 1); // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * - (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / - (sigmas[i] * sigmas[i]))); + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * + (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / + (sigmas[i] * sigmas[i]))); float sigma_down = std::sqrt(sigmas[i + 1] * 
sigmas[i + 1] - sigma_up * sigma_up); - auto t_fn = [](float sigma) -> float { return -log(sigma); }; - auto sigma_fn = [](float t) -> float { return exp(-t); }; + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + auto sigma_fn = [](float t) -> float { return exp(-t); }; if (sigma_down == 0) { // Euler step - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(d); j++) { vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; @@ -1060,15 +1090,15 @@ class StableDiffusionGGML { } } else { // DPM-Solver++(2S) - float t = t_fn(sigmas[i]); + float t = t_fn(sigmas[i]); float t_next = t_fn(sigma_down); - float h = t_next - t; - float s = t + 0.5f * h; + float h = t_next - t; + float s = t + 0.5f * h; - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_x2 = (float*)x2->data; - float* vec_denoised = (float*)denoised->data; + float *vec_d = (float *) d->data; + float *vec_x = (float *) x->data; + float *vec_x2 = (float *) x2->data; + float *vec_denoised = (float *) denoised->data; // First half-step for (int j = 0; j < ggml_nelements(x); j++) { @@ -1087,8 +1117,8 @@ class StableDiffusionGGML { if (sigmas[i + 1] > 0) { ggml_tensor_set_f32_randn(noise, rng); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int i = 0; i < ggml_nelements(x); i++) { vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; @@ -1096,10 +1126,11 @@ class StableDiffusionGGML { } } } - } break; + } + break; case DPMPP2M: // DPM++ (2M) from Karras et al (2022) { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *old_denoised = ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1107,14 +1138,14 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float b = exp(-h) - 1.f; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float b = exp(-h) - 1.f; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; + float *vec_old_denoised = (float *) old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -1123,10 +1154,10 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float r = h_last / h; + float r = h_last / h; for (int j = 0; j < ggml_nelements(x); j++) { float denoised_d = - (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -1136,10 +1167,11 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } break; + } + break; case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 { - struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *old_denoised = 
ggml_dup_tensor(work_ctx, x); auto t_fn = [](float sigma) -> float { return -log(sigma); }; @@ -1147,13 +1179,13 @@ class StableDiffusionGGML { // denoise denoise(x, sigmas[i], i + 1); - float t = t_fn(sigmas[i]); - float t_next = t_fn(sigmas[i + 1]); - float h = t_next - t; - float a = sigmas[i + 1] / sigmas[i]; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - float* vec_old_denoised = (float*)old_denoised->data; + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; + float *vec_old_denoised = (float *) old_denoised->data; if (i == 0 || sigmas[i + 1] == 0) { // Simpler step for the edge cases @@ -1163,14 +1195,14 @@ class StableDiffusionGGML { } } else { float h_last = t - t_fn(sigmas[i - 1]); - float h_min = std::min(h_last, h); - float h_max = std::max(h_last, h); - float r = h_max / h_min; - float h_d = (h_max + h_min) / 2.f; - float b = exp(-h_d) - 1.f; + float h_min = std::min(h_last, h); + float h_max = std::max(h_last, h); + float r = h_max / h_min; + float h_d = (h_max + h_min) / 2.f; + float b = exp(-h_d) - 1.f; for (int j = 0; j < ggml_nelements(x); j++) { float denoised_d = - (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; + (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j]; vec_x[j] = a * vec_x[j] - b * denoised_d; } } @@ -1180,11 +1212,12 @@ class StableDiffusionGGML { vec_old_denoised[j] = vec_denoised[j]; } } - } break; + } + break; case LCM: // Latent Consistency Models { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor *d = ggml_dup_tensor(work_ctx, x); for (int i = 0; i < steps; i++) { float sigma = sigmas[i]; @@ -1194,8 +1227,8 @@ class StableDiffusionGGML { // x = denoised { - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; + float *vec_x = (float *) x->data; + float *vec_denoised = (float *) denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_denoised[j]; } @@ -1206,8 +1239,8 @@ class StableDiffusionGGML { ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); { - float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; + float *vec_x = (float *) x->data; + float *vec_noise = (float *) noise->data; for (int j = 0; j < ggml_nelements(x); j++) { vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j]; @@ -1215,7 +1248,8 @@ class StableDiffusionGGML { } } } - } break; + } + break; default: LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); @@ -1226,28 +1260,28 @@ class StableDiffusionGGML { } // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { + ggml_tensor *get_first_stage_encoding(ggml_context *work_ctx, ggml_tensor *moments) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], - moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); + ggml_tensor *latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], 
moments->ne[1], + moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, latent); ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(work_ctx, "noise.bin"); { - float mean = 0; + float mean = 0; float logvar = 0; - float value = 0; - float std_ = 0; + float value = 0; + float std_ = 0; for (int i = 0; i < latent->ne[3]; i++) { for (int j = 0; j < latent->ne[2]; j++) { for (int k = 0; k < latent->ne[1]; k++) { for (int l = 0; l < latent->ne[0]; l++) { - mean = ggml_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); + mean = ggml_tensor_get_f32(moments, l, k, j, i); + logvar = ggml_tensor_get_f32(moments, l, k, j + (int) latent->ne[2], i); logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); - value = value * scale_factor; + std_ = std::exp(0.5f * logvar); + value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); + value = value * scale_factor; // printf("%d %d %d %d -> %f\n", i, j, k, l, value); ggml_tensor_set_f32(latent, value, l, k, j, i); } @@ -1258,14 +1292,14 @@ class StableDiffusionGGML { return latent; } - ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - ggml_tensor* result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, + ggml_tensor *compute_first_stage(ggml_context *work_ctx, ggml_tensor *x, bool decode) { + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + ggml_tensor *result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, decode ? (W * 8) : (W / 8), // width decode ? (H * 8) : (H / 8), // height decode ? 3 : (use_tiny_autoencoder ? 4 : 8)); // channels - int64_t t0 = ggml_time_ms(); + int64_t t0 = ggml_time_ms(); if (!use_tiny_autoencoder) { if (decode) { ggml_tensor_scale(x, 1.0f / scale_factor); @@ -1274,7 +1308,7 @@ class StableDiffusionGGML { } if (vae_tiling && decode) { // TODO: support tiling vae encode // split latent in 32x32 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) { if (init) { first_stage_model.alloc_compute_buffer(in, decode); } else { @@ -1293,7 +1327,7 @@ class StableDiffusionGGML { } else { if (vae_tiling && decode) { // TODO: support tiling vae encode // split latent in 64x64 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + auto on_tiling = [&](ggml_tensor *in, ggml_tensor *out, bool init) { if (init) { tae_first_stage.alloc_compute_buffer(in, decode); } else { @@ -1316,11 +1350,11 @@ class StableDiffusionGGML { return result; } - ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + ggml_tensor *encode_first_stage(ggml_context *work_ctx, ggml_tensor *x) { return compute_first_stage(work_ctx, x, false); } - ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + ggml_tensor *decode_first_stage(ggml_context *work_ctx, ggml_tensor *x) { return compute_first_stage(work_ctx, x, true); } }; @@ -1328,19 +1362,19 @@ class StableDiffusionGGML { /*================================================= SD API ==================================================*/ struct sd_ctx_t { - StableDiffusionGGML* sd = NULL; + StableDiffusionGGML *sd = NULL; }; -sd_ctx_t* new_sd_ctx(int n_threads, +sd_ctx_t *new_sd_ctx(int n_threads, bool vae_decode_only, 
bool free_params_immediately, - const char* lora_model_dir_c_str, + const char *lora_model_dir_c_str, enum rng_type_t rng_type, bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, bool init_backend_immediately) { - sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); + sd_ctx_t *sd_ctx = (sd_ctx_t *) malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; } @@ -1358,7 +1392,7 @@ sd_ctx_t* new_sd_ctx(int n_threads, return sd_ctx; } -void free_sd_ctx(sd_ctx_t* sd_ctx) { +void free_sd_ctx(sd_ctx_t *sd_ctx) { if (sd_ctx->sd != NULL) { delete sd_ctx->sd; sd_ctx->sd = NULL; @@ -1366,7 +1400,7 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) { free(sd_ctx); } -void init_backend(sd_ctx_t* sd_ctx) { +void init_backend(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1374,11 +1408,11 @@ void init_backend(sd_ctx_t* sd_ctx) { sd_ctx->sd->init_backend(); } -void set_options(sd_ctx_t* sd_ctx, +void set_options(sd_ctx_t *sd_ctx, int n_threads, bool vae_decode_only, bool free_params_immediately, - const char* lora_model_dir, + const char *lora_model_dir, rng_type_t rng_type, bool vae_tiling, sd_type_t wtype, @@ -1388,17 +1422,17 @@ void set_options(sd_ctx_t* sd_ctx, return; } sd_ctx->sd->set_options( - n_threads, - vae_decode_only, - free_params_immediately, - std::string(lora_model_dir), - rng_type, - vae_tiling, - wtype, - schedule); + n_threads, + vae_decode_only, + free_params_immediately, + std::string(lora_model_dir), + rng_type, + vae_tiling, + wtype, + schedule); } -bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { +bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1406,7 +1440,7 @@ bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* p return sd_ctx->sd->load_clip_from_file(std::string(model_path), true, std::string(prefix)); } -void free_clip_params(sd_ctx_t* sd_ctx) { +void free_clip_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1414,7 +1448,7 @@ void free_clip_params(sd_ctx_t* sd_ctx) { sd_ctx->sd->free_clip_params(); } -bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { +bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1422,7 +1456,7 @@ bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* p return sd_ctx->sd->load_unet_from_file(std::string(model_path), true, std::string(prefix)); } -void free_unet_params(sd_ctx_t* sd_ctx) { +void free_unet_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1430,7 +1464,7 @@ void free_unet_params(sd_ctx_t* sd_ctx) { sd_ctx->sd->free_unet_params(); } -bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { +bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1438,7 +1472,7 @@ bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* pr return sd_ctx->sd->load_vae_from_file(std::string(model_path), true, std::string(prefix)); } -void 
free_vae_params(sd_ctx_t* sd_ctx) { +void free_vae_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1446,7 +1480,7 @@ void free_vae_params(sd_ctx_t* sd_ctx) { sd_ctx->sd->free_vae_params(); } -bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path) { +bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1454,7 +1488,7 @@ bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path) { return sd_ctx->sd->load_taesd_from_file(std::string(model_path)); } -void free_taesd_params(sd_ctx_t* sd_ctx) { +void free_taesd_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1463,7 +1497,7 @@ void free_taesd_params(sd_ctx_t* sd_ctx) { } // load all model from one file -bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path) { +bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1472,7 +1506,7 @@ bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path) { } // free all model from one file -void free_diffusions_params(sd_ctx_t* sd_ctx) { +void free_diffusions_params(sd_ctx_t *sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1480,9 +1514,9 @@ void free_diffusions_params(sd_ctx_t* sd_ctx) { return sd_ctx->sd->free_diffusions_params(); } -sd_image_t* txt2img(sd_ctx_t* sd_ctx, - const char* prompt_c_str, - const char* negative_prompt_c_str, +sd_image_t *txt2img(sd_ctx_t *sd_ctx, + const char *prompt_c_str, + const char *negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1500,10 +1534,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, std::string negative_prompt(negative_prompt_c_str); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto& kv : lora_f2m) { + for (auto &kv: lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } @@ -1519,10 +1553,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, params.mem_size += width * height * 3 * sizeof(float); params.mem_size *= batch_count; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context* work_ctx = ggml_init(params); + struct ggml_context *work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; @@ -1532,16 +1566,16 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library // by a third party with a seed <0, let's incorporate randomization here. 
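        // A minimal sketch of the seeding contract assumed at this point: a
        // negative seed is randomized once up front, and batch image b is then
        // sampled with seed + b (see cur_seed below), so the reported seed
        // reproduces the whole batch. resolve_seed is an illustrative helper,
        // not part of the exported API:
        //
        //     int64_t resolve_seed(int64_t seed) {
        //         if (seed < 0) {               // library caller asked for "random"
        //             srand((int) time(NULL));  // same fallback as the code below
        //             seed = rand();
        //         }
        //         return seed;                  // image b is then sampled with seed + b
        //     }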
- srand((int)time(NULL)); + srand((int) time(NULL)); seed = rand(); } - t0 = ggml_time_ms(); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor* c = cond_pair.first; - ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor* uc = NULL; - struct ggml_tensor* uc_vector = NULL; + t0 = ggml_time_ms(); + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor *c = cond_pair.first; + ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor *uc = NULL; + struct ggml_tensor *uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { @@ -1549,8 +1583,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, } auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } t1 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0); @@ -1559,23 +1593,23 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, sd_ctx->sd->cond_stage_model.free_params_buffer(); } - std::vector final_latents; // collect latents to decode + std::vector final_latents; // collect latents to decode int C = 4; int W = width / 8; int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); - int64_t cur_seed = seed + b; + int64_t cur_seed = seed + b; LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + struct ggml_tensor *x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, + struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigmas); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1592,10 +1626,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, (t3 - t1) * 1.0f / 1000); LOG_INFO("decoding %zu latents", final_latents.size()); - std::vector decoded_images; // collect decoded images + std::vector decoded_images; // collect decoded images for (size_t i = 0; i < final_latents.size(); i++) { - t1 = ggml_time_ms(); - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); + t1 = ggml_time_ms(); + struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); // print_ggml_tensor(img); if (img != NULL) { decoded_images.push_back(img); @@ -1609,30 +1643,30 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model.free_params_buffer(); } - sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); + sd_image_t *result_images = (sd_image_t *) calloc(batch_count, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); 
return NULL; } for (size_t i = 0; i < decoded_images.size(); i++) { - result_images[i].width = width; - result_images[i].height = height; + result_images[i].width = width; + result_images[i].height = height; result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(decoded_images[i]); + result_images[i].data = sd_tensor_to_image(decoded_images[i]); } ggml_free(work_ctx); LOG_INFO( - "txt2img completed in %.2fs", - (t4 - t0) * 1.0f / 1000); + "txt2img completed in %.2fs", + (t4 - t0) * 1.0f / 1000); return result_images; } -sd_image_t* img2img(sd_ctx_t* sd_ctx, +sd_image_t *img2img(sd_ctx_t *sd_ctx, sd_image_t init_image, - const char* prompt_c_str, - const char* negative_prompt_c_str, + const char *prompt_c_str, + const char *negative_prompt_c_str, int clip_skip, float cfg_scale, int width, @@ -1651,7 +1685,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, LOG_INFO("img2img %dx%d", width, height); std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - size_t t_enc = static_cast(sample_steps * strength); + size_t t_enc = static_cast(sample_steps * strength); LOG_INFO("target t_enc is %zu steps", t_enc); std::vector sigma_sched; sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); @@ -1660,26 +1694,26 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, params.mem_size = static_cast(10 * 1024) * 1024; // 10 MB params.mem_size += width * height * 3 * sizeof(float) * 2; params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); // draft context - struct ggml_context* work_ctx = ggml_init(params); + struct ggml_context *work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; } if (seed < 0) { - seed = (int)time(NULL); + seed = (int) time(NULL); } sd_ctx->sd->rng->manual_seed(seed); // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); + auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto& kv : lora_f2m) { + for (auto &kv: lora_f2m) { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } prompt = result_pair.second; @@ -1691,13 +1725,13 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, int64_t t1 = ggml_time_ms(); LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + ggml_tensor *init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(init_image.data, init_img); - t0 = ggml_time_ms(); - ggml_tensor* init_latent = NULL; + t0 = ggml_time_ms(); + ggml_tensor *init_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); - init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + ggml_tensor *moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); + init_latent = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); } else { init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } @@ -1705,11 +1739,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor* c = cond_pair.first; - ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor* uc = NULL; - struct 
ggml_tensor* uc_vector = NULL; + auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); + ggml_tensor *c = cond_pair.first; + ggml_tensor *c_vector = cond_pair.second; // [adm_in_channels, ] + struct ggml_tensor *uc = NULL; + struct ggml_tensor *uc_vector = NULL; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { @@ -1717,8 +1751,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] + uc = uncond_pair.first; + uc_vector = uncond_pair.second; // [adm_in_channels, ] } int64_t t2 = ggml_time_ms(); LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1); @@ -1727,11 +1761,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } sd_ctx->sd->rng->manual_seed(seed); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent); + struct ggml_tensor *noise = ggml_dup_tensor(work_ctx, init_latent); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, + struct ggml_tensor *x_0 = sd_ctx->sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigma_sched); // struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1741,7 +1775,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_ctx->sd->diffusion_model.free_params_buffer(); } - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); + struct ggml_tensor *img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model.free_params_buffer(); } @@ -1750,17 +1784,17 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, return NULL; } - sd_image_t* result_images = (sd_image_t*)calloc(1, sizeof(sd_image_t)); + sd_image_t *result_images = (sd_image_t *) calloc(1, sizeof(sd_image_t)); if (result_images == NULL) { ggml_free(work_ctx); return NULL; } for (size_t i = 0; i < 1; i++) { - result_images[i].width = width; - result_images[i].height = height; + result_images[i].width = width; + result_images[i].height = height; result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(img); + result_images[i].data = sd_tensor_to_image(img); } ggml_free(work_ctx); From 18aec69789427eee04a1d5b55f43d0d5d9da2734 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Fri, 26 Jan 2024 18:02:16 +0800 Subject: [PATCH 4/8] full ci --todo reload model --- examples/cli/main.cpp | 329 ++++++++++++++++++++++++++++++++++-------- stable-diffusion.cpp | 24 ++- 2 files changed, 292 insertions(+), 61 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 31893751..6ab638e5 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "stable-diffusion.h" @@ -408,67 +409,88 @@ void parse_args(int argc, const char **argv, SDParams ¶ms) { print_usage(argc, argv); exit(1); } +} + +bool check_params(SDParams params) { + std::vector required_args; + std::vector invalid_args; if (params.n_threads <= 0) { params.n_threads = get_num_physical_cores(); } - if (params.mode != STREAM) { - if 
(params.mode != CONVERT && params.prompt.length() == 0) { - fprintf(stderr, "error: the following arguments are required: prompt\n"); - print_usage(argc, argv); - exit(1); - } + if (params.mode != CONVERT && params.prompt.length() == 0) { + required_args.emplace_back("prompt"); + } - if (params.model_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: model_path\n"); - print_usage(argc, argv); - exit(1); - } + if (params.model_path.length() == 0) { + required_args.emplace_back("model_path"); + } - if (params.mode == IMG2IMG && params.input_path.length() == 0) { - fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n"); - print_usage(argc, argv); - exit(1); - } + if (params.mode == IMG2IMG && params.input_path.length() == 0) { + required_args.emplace_back("init-img"); + } - if (params.output_path.length() == 0) { - fprintf(stderr, "error: the following arguments are required: output_path\n"); - print_usage(argc, argv); - exit(1); - } + if (params.output_path.length() == 0) { + required_args.emplace_back("output_path"); + } - if (params.width <= 0 || params.width % 64 != 0) { - fprintf(stderr, "error: the width must be a multiple of 64\n"); - exit(1); - } + if (params.width <= 0 || params.width % 64 != 0) { + invalid_args.emplace_back("the width must be a multiple of 64"); + } - if (params.height <= 0 || params.height % 64 != 0) { - fprintf(stderr, "error: the height must be a multiple of 64\n"); - exit(1); - } + if (params.height <= 0 || params.height % 64 != 0) { + invalid_args.emplace_back("the height must be a multiple of 64"); + } - if (params.sample_steps <= 0) { - fprintf(stderr, "error: the sample_steps must be greater than 0\n"); - exit(1); - } + if (params.sample_steps <= 0) { + invalid_args.emplace_back("the sample_steps must be greater than 0"); + } - if (params.strength < 0.f || params.strength > 1.f) { - fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n"); - exit(1); + if (params.strength < 0.f || params.strength > 1.f) { + invalid_args.emplace_back("can only work with strength in [0.0, 1.0]"); + } + + if (params.seed < 0) { + srand((int) time(NULL)); + params.seed = rand(); + } + + if (params.mode == CONVERT) { + if (params.output_path == "output.png") { + params.output_path = "output.gguf"; } + } - if (params.seed < 0) { - srand((int) time(NULL)); - params.seed = rand(); + if ((!invalid_args.empty()) || (!required_args.empty())) { + if (!invalid_args.empty()) { + std::ostringstream oss; + for (int i = 0; i < invalid_args.size(); i++) { + if (i > 0) { + oss << ",\n"; + } + oss << invalid_args[i]; + } + std::string invalid_args_str = oss.str(); + std::cout << "error: " << invalid_args_str << std::endl; } - if (params.mode == CONVERT) { - if (params.output_path == "output.png") { - params.output_path = "output.gguf"; + if (!required_args.empty()) { + std::ostringstream oss; + for (int i = 0; i < required_args.size(); i++) { + if (i > 0) { + oss << ","; + } + oss << required_args[i]; } + std::string required_args_str = oss.str(); + std::cout << "require: " << required_args_str << std::endl; } + + return false; } + + return true; } std::string get_image_params(SDParams params, int64_t seed) { @@ -505,20 +527,38 @@ void sd_log_cb(enum sd_log_level_t level, const char *log, void *data) { } } -std::vector parse_cin(std::string &input, std::vector ignore_args) { +std::vector parse_cin(std::string &input, std::set ignore_args) { std::vector inputTokens; std::string token; 
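    // The additions below give the STREAM REPL shell-like quote grouping:
    // a word starting with '"' opens a quoted run, subsequent words are
    // buffered space-separated in stmt, and a word ending with '"' closes
    // the run and strips the outer quotes. A worked example (illustrative
    // input, not taken from the source):
    //
    //     input : -p "a red fox" -m model.gguf
    //     tokens: ["fake run path, no use!", "-p", "a red fox", "-m", "model.gguf"]
    //
    // The fake argv[0] is pushed first because parse_args starts reading at
    // i = 1; an unterminated quote leaves the trailing words buffered in stmt.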
std::istringstream iss(input); std::string word; + bool in_stmt = false; + std::string stmt; + inputTokens.emplace_back("fake run path, no use!"); while (iss >> word) { + if (word[0] == '"') { + in_stmt = true; + } + + if (word[word.length() - 1] == '"') { + stmt += word; + word = stmt.substr(1, stmt.length() - 2); + stmt = ""; + in_stmt = false; + } + + if (in_stmt) { + stmt += word; + stmt += " "; + continue; + } inputTokens.push_back(word); } std::vector commands; for (int i = 0; i < inputTokens.size(); i++) { - - if (std::find(ignore_args.begin(), ignore_args.end(), inputTokens[i]) != ignore_args.end()) { + if (ignore_args.find(inputTokens[i]) != ignore_args.end()) { i++; continue; } @@ -527,6 +567,128 @@ std::vector parse_cin(std::string &input, std::vector return commands; } +SDParams merge_params(SDParams dst, SDParams src) { + if (dst.n_threads != src.n_threads) { + if (src.n_threads > 0) { + dst.n_threads = src.n_threads; + } + } + + if (dst.mode != src.mode) { + if (src.mode == TXT2IMG || src.mode == IMG2IMG) { + dst.mode = src.mode; + if (dst.mode == IMG2IMG) { + dst.vae_decode_only = false; + } + } + } + + if (dst.model_path != src.model_path) { + if (!src.model_path.empty()) { + dst.model_path = src.model_path; + } + } + + if (dst.vae_path != src.vae_path) { + if (!src.vae_path.empty()) { + dst.vae_path = src.vae_path; + } + } + + if (dst.clip_path != src.clip_path) { + if (!src.clip_path.empty()) { + dst.clip_path = src.clip_path; + } + } + + if (dst.unet_path != src.unet_path) { + if (!src.unet_path.empty()) { + dst.unet_path = src.unet_path; + } + } + + if (dst.taesd_path != src.taesd_path) { + if (!src.taesd_path.empty()) { + dst.taesd_path = src.taesd_path; + } + } + + if (dst.esrgan_path != src.esrgan_path) { + if (!src.esrgan_path.empty()) { + dst.esrgan_path = src.esrgan_path; + } + } + + if (dst.wtype != src.wtype) { + dst.wtype = src.wtype; + } + + if (dst.lora_model_dir != src.lora_model_dir) { + if (!src.lora_model_dir.empty()) { + dst.lora_model_dir = src.lora_model_dir; + } + } + + if (dst.output_path != src.output_path) { + if (!src.output_path.empty()) { + dst.output_path = src.output_path; + } + } + + if (dst.prompt != src.prompt) { + if (!src.prompt.empty()) { + dst.prompt = src.prompt; + } + } + + if (dst.negative_prompt != src.negative_prompt) { + if (!src.negative_prompt.empty()) { + dst.negative_prompt = src.negative_prompt; + } + } + + if (dst.cfg_scale != src.cfg_scale) { + if (src.cfg_scale >= 0) { + dst.cfg_scale = src.cfg_scale; + } + } + + if (dst.clip_skip != src.clip_skip) { + dst.clip_skip = src.clip_skip; + } + + if (dst.width != src.width) { + if (src.width > 0 || src.width % 64 == 0) { + dst.width = src.width; + } + } + + if (dst.height != src.height) { + if (src.height > 0 || src.height % 64 == 0) { + dst.height = src.height; + } + } + + if (dst.sample_steps != src.sample_steps) { + if (src.sample_steps > 0) { + dst.sample_steps = src.sample_steps; + } + } + + if (dst.strength != src.strength) { + if (src.strength >= 0.f && src.strength <= 1.f) { + dst.strength = src.strength; + } + } + + if (dst.seed != src.seed) { + if (src.seed > 0) { + dst.seed = src.seed; + } + } + return dst; +} + class CliInstance { public: sd_ctx_t *sd_ctx; @@ -548,7 +710,28 @@ class CliInstance { true); } - //TODO: dynamic load model + bool load_from_file(SDParams ¶ms) { + // free api always check if the following methods can free, so we can always free the model before load it. 
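        // Sketch of the reload sequence this method performs, free before
        // load, one component at a time (the two-argument load_* calls assume
        // the default prefix arguments declared in stable-diffusion.h):
        //
        //     free_diffusions_params(ctx);                 // drop any stale weights
        //     load_diffusions_from_file(ctx, model_path);  // combined checkpoint
        //     // optional per-component overrides:
        //     free_clip_params(ctx); load_clip_from_file(ctx, clip_path);
        //     free_vae_params(ctx);  load_vae_from_file(ctx, vae_path);
        //     free_unet_params(ctx); load_unet_from_file(ctx, unet_path);
        //
        // As the comment above notes, each free_* checks whether there is
        // anything to release, so freeing before every load is always safe.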
+ free_diffusions_params(sd_ctx); + auto load_status = load_diffusions_from_file(sd_ctx, params.model_path.c_str()); + + if (load_status && !params.clip_path.empty()) { + free_clip_params(sd_ctx); + load_status = load_clip_from_file(sd_ctx, params.clip_path.c_str()); + } + + if (load_status && !params.vae_path.empty()) { + free_vae_params(sd_ctx); + load_status = load_vae_from_file(sd_ctx, params.vae_path.c_str()); + } + + if (load_status && !params.unet_path.empty()) { + free_unet_params(sd_ctx); + load_status = load_unet_from_file(sd_ctx, params.unet_path.c_str()); + } + + return load_status; + } void txtimg(SDParams ¶ms) { set_options(sd_ctx, params.n_threads, @@ -682,8 +865,13 @@ class CliInstance { int main(int argc, const char *argv[]) { SDParams params; + parse_args(argc, argv, params); + if (params.mode != STREAM && !check_params(params)) { + return 1; + } + sd_set_log_callback(sd_log_cb, (void *) ¶ms); if (params.verbose) { @@ -713,34 +901,61 @@ int main(int argc, const char *argv[]) { } auto instance = new CliInstance(params); + if (params.mode == STREAM) { + std::cout << "you are in stream model, feel free to use txt2img or img2img" << std::endl; while (true) { - std::cout << "you are in stream model, take free to use txt2img or img2img" << std::endl; std::string input; + std::cout << "please input args: " << std::endl; std::getline(std::cin, input); - std::vector ignore_cmd = {""}; - auto args = parse_cin(input, ignore_cmd); + //hold an ignore cmd for feature to ignore the cmd not support + std::set ignore_cmd = {""}; + std::vector args = parse_cin(input, ignore_cmd); SDParams stream_params; const char **args_c_arr = new const char *[args.size()]; - for (int i = 0; i < args.size(); ++i) { - args_c_arr[i] = args[i].c_str(); + for (int i = 0; i < args.size(); i++) { + std::string arg = args[i]; + char *c_str = new char[args[i].length() + 1]; + std::strcpy(c_str, arg.c_str()); + args_c_arr[i] = c_str; } parse_args(args.size(), args_c_arr, stream_params); - if (stream_params.mode == TXT2IMG) { - instance->txtimg(stream_params); - } else if (stream_params.mode == IMG2IMG) { - instance->imgimg(stream_params); + if (params.model_path != stream_params.model_path || + params.clip_path != stream_params.clip_path || + params.vae_path != stream_params.vae_path || + params.unet_path != stream_params.unet_path) { + instance->load_from_file(stream_params); + } + params = merge_params(params, stream_params); + if (!check_params(params)) { + continue; + } + if (params.mode == TXT2IMG) { + instance->txtimg(params); + } else if (params.mode == IMG2IMG) { + instance->imgimg(params); } else { - exit(1); + return 1; } } } else { + if (!params.model_path.empty()) { + if (!instance->load_from_file(params)) { + return 1; + } + } else { + if (!params.clip_path.empty() && !params.vae_path.empty() && !params.unet_path.empty()) { + if (!instance->load_from_file(params)) { + return 1; + } + } + } if (params.mode == TXT2IMG) { instance->txtimg(params); } else if (params.mode == IMG2IMG) { instance->imgimg(params); } else { - exit(1); + return 0; } } return 0; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e3090803..5ead3c0b 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -82,6 +82,7 @@ class StableDiffusionGGML { TinyAutoEncoder tae_first_stage; + std::string model_path; std::string clip_path; std::string vae_path; std::string unet_path; @@ -156,7 +157,17 @@ class StableDiffusionGGML { sd_type_t wtype, schedule_t schedule) { this->n_threads = n_threads; - bool 
standalone=vae_path != clip_path && vae_path != unet_path; + bool standalone = clip_path != vae_path && vae_path != unet_path; + + std::string model_path; + if (!standalone && clip_path == vae_path) { + model_path = clip_path; + } + + if (!standalone && vae_path == unet_path) { + model_path = vae_path; + } + if (this->vae_decode_only != vae_decode_only) { this->vae_decode_only = vae_decode_only; if (!vae_path.empty() && first_stage_model.params_buffer_size > 0) { @@ -178,12 +189,17 @@ class StableDiffusionGGML { } this->vae_tiling = vae_tiling; - if (this->wtype !=(ggml_type) wtype) { + if (this->wtype != (ggml_type) wtype) { this->wtype = (ggml_type) wtype; - // TODO: change wtype, need reload model + // TODO: can reload weight +// if (!standalone) { +// free_diffusions_params(); +// load_diffusions_from_file(model_path); +// } + } - if (this->schedule!=schedule){ + if (this->schedule != schedule) { this->schedule = schedule; apply_schedule(); } From 525f54b8710b20693aa7899279210e7e2006d5be Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Sun, 4 Feb 2024 11:08:06 +0800 Subject: [PATCH 5/8] format code --- common.hpp | 2 +- examples/cli/main.cpp | 95 ++++++++++++------------ stable-diffusion.cpp | 164 +++++++++++++++++++++--------------------- stable-diffusion.h | 12 ++-- 4 files changed, 134 insertions(+), 139 deletions(-) diff --git a/common.hpp b/common.hpp index 4a423d5a..b79a3c92 100644 --- a/common.hpp +++ b/common.hpp @@ -465,7 +465,7 @@ struct SpatialTransformer { #if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false); // [N * n_head, h * w, d_head] #else - struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position] + struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position] // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0); kq = ggml_soft_max_inplace(ctx, kq); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index fa393489..0ab66e70 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -3,10 +3,10 @@ #include #include #include +#include +#include #include #include -#include -#include #include "preprocessing.hpp" #include "stable-diffusion.h" @@ -20,9 +20,9 @@ #include "stb_image_write.h" -const char *rng_type_to_str[] = { - "std_default", - "cuda", +const char* rng_type_to_str[] = { + "std_default", + "cuda", }; // Names of the sampler method, same order as enum sample_method in stable-diffusion.h @@ -93,7 +93,7 @@ struct SDParams { int64_t seed = 42; bool verbose = false; bool vae_tiling = false; - bool vae_decode_only = false; + bool vae_decode_only = false; bool control_net_cpu = false; bool canny_preprocess = false; }; @@ -190,7 +190,7 @@ void print_usage(int argc, const char* argv[]) { printf(" -v, --verbose print extra info\n"); } -void parse_args(int argc, const char **argv, SDParams ¶ms) { +void parse_args(int argc, const char** argv, SDParams& params) { bool invalid_arg = false; std::string arg; for (int i = 1; i < argc; i++) { @@ -496,7 +496,7 @@ bool check_params(SDParams params) { } if (params.seed < 0) { - srand((int) time(NULL)); + srand((int)time(NULL)); params.seed = rand(); } @@ -571,8 +571,7 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { } } - -std::vector parse_cin(std::string &input, std::set ignore_args) { +std::vector parse_cin(std::string& input, std::set ignore_args) { std::vector inputTokens; std::string token; std::istringstream 
iss(input); @@ -588,8 +587,8 @@ std::vector parse_cin(std::string &input, std::set ign if (word[word.length() - 1] == '"') { stmt += word; - word = stmt.substr(1, stmt.length() - 2); - stmt = ""; + word = stmt.substr(1, stmt.length() - 2); + stmt = ""; in_stmt = false; } @@ -736,26 +735,26 @@ SDParams merge_params(SDParams dst, SDParams src) { class CliInstance { public: - sd_ctx_t *sd_ctx; + sd_ctx_t* sd_ctx; ~CliInstance() { free_sd_ctx(sd_ctx); } - CliInstance(const SDParams ¶ms) { + CliInstance(const SDParams& params) { sd_ctx = new_sd_ctx( - params.n_threads, - params.vae_decode_only, - true, - params.lora_model_dir.c_str(), - params.rng_type, - params.vae_tiling, - params.wtype, - params.schedule, - true); - } - - bool load_from_file(SDParams ¶ms) { + params.n_threads, + params.vae_decode_only, + true, + params.lora_model_dir.c_str(), + params.rng_type, + params.vae_tiling, + params.wtype, + params.schedule, + true); + } + + bool load_from_file(SDParams& params) { // free api always check if the following methods can free, so we can always free the model before load it. free_diffusions_params(sd_ctx); auto load_status = load_diffusions_from_file(sd_ctx, params.model_path.c_str()); @@ -778,7 +777,7 @@ class CliInstance { return load_status; } - void txtimg(SDParams ¶ms) { + void txtimg(SDParams& params) { set_options(sd_ctx, params.n_threads, params.vae_decode_only, true, @@ -787,7 +786,7 @@ class CliInstance { params.vae_tiling, params.wtype, params.schedule); - sd_image_t *results = txt2img(sd_ctx, + sd_image_t* results = txt2img(sd_ctx, params.prompt.c_str(), params.negative_prompt.c_str(), params.clip_skip, @@ -798,12 +797,11 @@ class CliInstance { params.sample_steps, params.seed, params.batch_count); - results = upscaler(params, results); + results = upscaler(params, results); save_image(params, results); - } - void imgimg(SDParams ¶ms) { + void imgimg(SDParams& params) { set_options(sd_ctx, params.n_threads, params.vae_decode_only, true, @@ -812,9 +810,9 @@ class CliInstance { params.vae_tiling, params.wtype, params.schedule); - uint8_t *input_image_buffer = NULL; + uint8_t* input_image_buffer = NULL; - int c = 0; + int c = 0; input_image_buffer = stbi_load(params.input_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); if (input_image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); @@ -837,12 +835,12 @@ class CliInstance { return; } - sd_image_t input_image = {(uint32_t) params.width, - (uint32_t) params.height, + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, 3, input_image_buffer}; - sd_image_t *results = img2img(sd_ctx, + sd_image_t* results = img2img(sd_ctx, input_image, params.prompt.c_str(), params.negative_prompt.c_str(), @@ -855,21 +853,20 @@ class CliInstance { params.strength, params.seed, params.batch_count); - results = upscaler(params, results); + results = upscaler(params, results); save_image(params, results); } protected: - - void save_image(const SDParams ¶ms, sd_image_t *results) { - size_t last = params.output_path.find_last_of("."); + void save_image(const SDParams& params, sd_image_t* results) { + size_t last = params.output_path.find_last_of("."); std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path; for (int i = 0; i < params.batch_count; i++) { if (results[i].data == NULL) { continue; } std::string final_image_path = - i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; + i > 0 ? 
dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png"; stbi_write_png(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, results[i].data, 0, get_image_params(params, params.seed + i).c_str()); printf("save result image to '%s'\n", final_image_path.c_str()); @@ -879,10 +876,10 @@ class CliInstance { free(results); } - sd_image_t *upscaler(const SDParams ¶ms, sd_image_t *results) { + sd_image_t* upscaler(const SDParams& params, sd_image_t* results) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0) { - upscaler_ctx_t *upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), + upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.n_threads, params.wtype); @@ -908,7 +905,7 @@ class CliInstance { } }; -int main(int argc, const char *argv[]) { +int main(int argc, const char* argv[]) { SDParams params; parse_args(argc, argv, params); @@ -917,7 +914,7 @@ int main(int argc, const char *argv[]) { return 1; } - sd_set_log_callback(sd_log_cb, (void *) ¶ms); + sd_set_log_callback(sd_log_cb, (void*)¶ms); if (params.verbose) { print_params(params); @@ -953,14 +950,14 @@ int main(int argc, const char *argv[]) { std::string input; std::cout << "please input args: " << std::endl; std::getline(std::cin, input); - //hold an ignore cmd for feature to ignore the cmd not support + // hold an ignore cmd for feature to ignore the cmd not support std::set ignore_cmd = {""}; - std::vector args = parse_cin(input, ignore_cmd); + std::vector args = parse_cin(input, ignore_cmd); SDParams stream_params; - const char **args_c_arr = new const char *[args.size()]; + const char** args_c_arr = new const char*[args.size()]; for (int i = 0; i < args.size(); i++) { std::string arg = args[i]; - char *c_str = new char[args[i].length() + 1]; + char* c_str = new char[args[i].length() + 1]; std::strcpy(c_str, arg.c_str()); args_c_arr[i] = c_str; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 8077aeb8..e29839b6 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -75,11 +75,11 @@ class StableDiffusionGGML { std::map loras; std::shared_ptr denoiser = std::make_shared(); - schedule_t schedule = DEFAULT; + schedule_t schedule = DEFAULT; - ggml_backend_t backend = NULL; // general backend + ggml_backend_t backend = NULL; // general backend ggml_type model_data_type = GGML_TYPE_COUNT; // runtime weight type - ggml_type wtype = GGML_TYPE_COUNT; // options weight type + ggml_type wtype = GGML_TYPE_COUNT; // options weight type TinyAutoEncoder tae_first_stage; @@ -104,15 +104,15 @@ class StableDiffusionGGML { ggml_type wtype, schedule_t schedule, bool init_backend_immediately = true) - : n_threads(n_threads), - vae_decode_only(vae_decode_only), - free_params_immediately(free_params_immediately), - lora_model_dir(lora_model_dir), - vae_tiling(vae_tiling), - wtype(wtype), - schedule(schedule) { + : n_threads(n_threads), + vae_decode_only(vae_decode_only), + free_params_immediately(free_params_immediately), + lora_model_dir(lora_model_dir), + vae_tiling(vae_tiling), + wtype(wtype), + schedule(schedule) { first_stage_model.decode_only = vae_decode_only; - tae_first_stage.decode_only = vae_decode_only; + tae_first_stage.decode_only = vae_decode_only; if (rng_type == STD_DEFAULT_RNG) { rng = std::make_shared(); } else if (rng_type == CUDA_RNG) { @@ -152,13 +152,13 @@ class StableDiffusionGGML { } void set_options(int n_threads, - bool vae_decode_only, - bool free_params_immediately, - 
std::string lora_model_dir, - rng_type_t rng_type, - bool vae_tiling, - sd_type_t wtype, - schedule_t schedule) { + bool vae_decode_only, + bool free_params_immediately, + std::string lora_model_dir, + rng_type_t rng_type, + bool vae_tiling, + sd_type_t wtype, + schedule_t schedule) { this->n_threads = n_threads; bool standalone = clip_path != vae_path && vae_path != unet_path; @@ -184,7 +184,7 @@ class StableDiffusionGGML { } this->free_params_immediately = free_params_immediately; - this->lora_model_dir = std::move(lora_model_dir); + this->lora_model_dir = std::move(lora_model_dir); if (rng_type == STD_DEFAULT_RNG) { rng = std::make_shared(); } else if (rng_type == CUDA_RNG) { @@ -192,14 +192,13 @@ class StableDiffusionGGML { } this->vae_tiling = vae_tiling; - if (this->wtype != (ggml_type) wtype) { - this->wtype = (ggml_type) wtype; + if (this->wtype != (ggml_type)wtype) { + this->wtype = (ggml_type)wtype; // TODO: can reload weight // if (!standalone) { // free_diffusions_params(); // load_diffusions_from_file(model_path); // } - } if (this->schedule != schedule) { @@ -208,7 +207,7 @@ class StableDiffusionGGML { } } - bool load_clip_from_file(const std::string &model_path, bool standalone = true, const std::string &prefix = "te.") { + bool load_clip_from_file(const std::string& model_path, bool standalone = true, const std::string& prefix = "te.") { if (backend == NULL) { LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); return false; @@ -274,11 +273,11 @@ class StableDiffusionGGML { } struct ggml_init_params params; - params.mem_size = static_cast(3 * 1024) * 1024; // 3M + params.mem_size = static_cast(3 * 1024) * 1024; // 3M params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check if (!ctx) { LOG_ERROR("ggml_init() failed"); return false; @@ -288,10 +287,10 @@ class StableDiffusionGGML { LOG_DEBUG("loading clip weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set ignore_tensors; - for (auto &pair: tensors) { + for (auto& pair : tensors) { tensors_need_to_load.insert(pair); } @@ -316,9 +315,9 @@ class StableDiffusionGGML { } } - bool load_unet_from_file(const std::string &model_path, - bool standalone = true, - const std::string &prefix = "unet.") { + bool load_unet_from_file(const std::string& model_path, + bool standalone = true, + const std::string& prefix = "unet.") { if (backend == NULL) { LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); return false; @@ -351,11 +350,11 @@ class StableDiffusionGGML { } struct ggml_init_params params; - params.mem_size = static_cast(3 * 1024) * 1024; // 3M + params.mem_size = static_cast(3 * 1024) * 1024; // 3M params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; - struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check if (!ctx) { LOG_ERROR("ggml_init() failed"); @@ -366,13 +365,13 @@ class StableDiffusionGGML { LOG_DEBUG("loading weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set 
ignore_tensors; - ggml_tensor *alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); - calculate_alphas_cumprod((float *) alphas_cumprod_tensor->data); + ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS); + calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data); tensors_need_to_load["alphas_cumprod"] = alphas_cumprod_tensor; - for (auto &pair: tensors) { - const std::string &name = pair.first; + for (auto& pair : tensors) { + const std::string& name = pair.first; if (starts_with(name, "cond_stage_model.") || starts_with(name, "first_stage_model.")) { ignore_tensors.insert(name); continue; @@ -415,9 +414,9 @@ class StableDiffusionGGML { } } - bool load_vae_from_file(const std::string &model_path, - bool standalone = true, - const std::string &prefix = "vae.") { + bool load_vae_from_file(const std::string& model_path, + bool standalone = true, + const std::string& prefix = "vae.") { if (backend == NULL) { LOG_ERROR("if you set init_backend_immediately false, please call init_backend first"); return false; @@ -454,11 +453,11 @@ class StableDiffusionGGML { } struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 3M + params.mem_size = static_cast(10 * 1024) * 1024; // 3M params.mem_buffer = NULL; - params.no_alloc = false; + params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - struct ggml_context *ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check + struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check if (!ctx) { LOG_ERROR("ggml_init() failed"); return false; @@ -468,10 +467,10 @@ class StableDiffusionGGML { LOG_DEBUG("loading weights"); int64_t t0 = ggml_time_ms(); - std::map tensors_need_to_load; + std::map tensors_need_to_load; std::set ignore_tensors; - for (auto &pair: tensors) { - const std::string &name = pair.first; + for (auto& pair : tensors) { + const std::string& name = pair.first; if (vae_decode_only && (starts_with(name, "first_stage_model.encoder") || starts_with(name, "first_stage_model.quant"))) { ignore_tensors.insert(name); @@ -500,7 +499,7 @@ class StableDiffusionGGML { } // load the all model from one file - bool load_diffusions_from_file(const std::string &model_path) { + bool load_diffusions_from_file(const std::string& model_path) { LOG_INFO("loading model from '%s'", model_path.c_str()); if (!load_clip_from_file(model_path, false, "")) { free_clip_params(); @@ -534,7 +533,7 @@ class StableDiffusionGGML { LOG_INFO("free vae params"); } - bool load_taesd_from_file(const std::string &taesd_path) { + bool load_taesd_from_file(const std::string& taesd_path) { if (first_stage_model.params_buffer_size > 0) { free_vae_params(); } @@ -542,7 +541,7 @@ class StableDiffusionGGML { return false; } - this->taesd_path = taesd_path; + this->taesd_path = taesd_path; use_tiny_autoencoder = true; return true; } @@ -826,25 +825,25 @@ class StableDiffusionGGML { switch (schedule) { case DISCRETE: LOG_INFO("running with discrete schedule"); - denoiser->schedule = std::make_shared(); - break; + denoiser->schedule = std::make_shared(); + break; case KARRAS: LOG_INFO("running with Karras schedule"); - denoiser->schedule = std::make_shared(); - break; + denoiser->schedule = std::make_shared(); + break; case DEFAULT: // Don't touch anything. 
- break; + break; default: LOG_ERROR("Unknown schedule %i", schedule); - abort(); + abort(); } } for (int i = 0; i < TIMESTEPS; i++) { denoiser->schedule->alphas_cumprod[i] = alphas_cumprod_tensor[i]; - denoiser->schedule->sigmas[i] = std::sqrt( - (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); + denoiser->schedule->sigmas[i] = std::sqrt( + (1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); } } @@ -1614,14 +1613,14 @@ struct sd_ctx_t { sd_ctx_t* new_sd_ctx(int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir_c_str, + const char* lora_model_dir_c_str, enum rng_type_t rng_type, bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, bool keep_control_net_cpu, bool init_backend_immediately) { - sd_ctx_t *sd_ctx = (sd_ctx_t *) malloc(sizeof(sd_ctx_t)); + sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; } @@ -1647,8 +1646,7 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) { free(sd_ctx); } - -void init_backend(sd_ctx_t *sd_ctx) { +void init_backend(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1656,11 +1654,11 @@ void init_backend(sd_ctx_t *sd_ctx) { sd_ctx->sd->init_backend(); } -void set_options(sd_ctx_t *sd_ctx, +void set_options(sd_ctx_t* sd_ctx, int n_threads, bool vae_decode_only, bool free_params_immediately, - const char *lora_model_dir, + const char* lora_model_dir, rng_type_t rng_type, bool vae_tiling, sd_type_t wtype, @@ -1670,17 +1668,17 @@ void set_options(sd_ctx_t *sd_ctx, return; } sd_ctx->sd->set_options( - n_threads, - vae_decode_only, - free_params_immediately, - std::string(lora_model_dir), - rng_type, - vae_tiling, - wtype, - schedule); + n_threads, + vae_decode_only, + free_params_immediately, + std::string(lora_model_dir), + rng_type, + vae_tiling, + wtype, + schedule); } -bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_clip_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1688,7 +1686,7 @@ bool load_clip_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *p return sd_ctx->sd->load_clip_from_file(std::string(model_path), true, std::string(prefix)); } -void free_clip_params(sd_ctx_t *sd_ctx) { +void free_clip_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1696,7 +1694,7 @@ void free_clip_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_clip_params(); } -bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *prefix) { +bool load_unet_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1704,7 +1702,7 @@ bool load_unet_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *p return sd_ctx->sd->load_unet_from_file(std::string(model_path), true, std::string(prefix)); } -void free_unet_params(sd_ctx_t *sd_ctx) { +void free_unet_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1712,7 +1710,7 @@ void free_unet_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_unet_params(); } -bool load_vae_from_file(sd_ctx_t 
*sd_ctx, const char *model_path, const char *prefix) { +bool load_vae_from_file(sd_ctx_t* sd_ctx, const char* model_path, const char* prefix) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1720,7 +1718,7 @@ bool load_vae_from_file(sd_ctx_t *sd_ctx, const char *model_path, const char *pr return sd_ctx->sd->load_vae_from_file(std::string(model_path), true, std::string(prefix)); } -void free_vae_params(sd_ctx_t *sd_ctx) { +void free_vae_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1728,7 +1726,7 @@ void free_vae_params(sd_ctx_t *sd_ctx) { sd_ctx->sd->free_vae_params(); } -bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { +bool load_taesd_from_file(sd_ctx_t* sd_ctx, const char* model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1736,7 +1734,7 @@ bool load_taesd_from_file(sd_ctx_t *sd_ctx, const char *model_path) { return sd_ctx->sd->load_taesd_from_file(std::string(model_path)); } -void free_taesd_params(sd_ctx_t *sd_ctx) { +void free_taesd_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; @@ -1745,7 +1743,7 @@ void free_taesd_params(sd_ctx_t *sd_ctx) { } // load all model from one file -bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { +bool load_diffusions_from_file(sd_ctx_t* sd_ctx, const char* model_path) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return false; @@ -1754,7 +1752,7 @@ bool load_diffusions_from_file(sd_ctx_t *sd_ctx, const char *model_path) { } // free all model from one file -void free_diffusions_params(sd_ctx_t *sd_ctx) { +void free_diffusions_params(sd_ctx_t* sd_ctx) { if (sd_ctx == NULL || sd_ctx->sd == NULL) { LOG_ERROR("must call new_sd_ctx first"); return; diff --git a/stable-diffusion.h b/stable-diffusion.h index a9f142af..642f3a3d 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -65,12 +65,12 @@ enum sd_type_t { SD_TYPE_Q8_0 = 8, SD_TYPE_Q8_1 = 9, // k-quantizations - SD_TYPE_Q2_K = 10, - SD_TYPE_Q3_K = 11, - SD_TYPE_Q4_K = 12, - SD_TYPE_Q5_K = 13, - SD_TYPE_Q6_K = 14, - SD_TYPE_Q8_K = 15, + SD_TYPE_Q2_K = 10, + SD_TYPE_Q3_K = 11, + SD_TYPE_Q4_K = 12, + SD_TYPE_Q5_K = 13, + SD_TYPE_Q6_K = 14, + SD_TYPE_Q8_K = 15, SD_TYPE_IQ2_XXS = 16, SD_TYPE_I8, SD_TYPE_I16, From 1e7ea7bf7a3537139ef8a71d805d472d2c46d3c4 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Sun, 4 Feb 2024 17:05:55 +0800 Subject: [PATCH 6/8] fix cli --- examples/cli/main.cpp | 28 ++++++++++++++++++++++++---- stable-diffusion.cpp | 3 +++ stable-diffusion.h | 15 ++++++--------- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 0ab66e70..6045a4a7 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -745,12 +745,13 @@ class CliInstance { sd_ctx = new_sd_ctx( params.n_threads, params.vae_decode_only, - true, + false, params.lora_model_dir.c_str(), params.rng_type, params.vae_tiling, params.wtype, params.schedule, + params.control_net_cpu, true); } @@ -786,6 +787,23 @@ class CliInstance { params.vae_tiling, params.wtype, params.schedule); + int c = 0; + uint8_t* input_image_buffer = stbi_load(params.control_image_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); + if (input_image_buffer == NULL) { + fprintf(stderr, "load image from '%s' failed\n", 
params.control_image_path.c_str()); + return; + } + if (c != 3) { + fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c); + free(input_image_buffer); + return; + } + + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, + 3, + input_image_buffer}; + sd_image_t* results = txt2img(sd_ctx, params.prompt.c_str(), params.negative_prompt.c_str(), @@ -796,8 +814,11 @@ class CliInstance { params.sample_method, params.sample_steps, params.seed, - params.batch_count); - results = upscaler(params, results); + params.batch_count, + &input_image, + params.control_strength); + + results = upscaler(params, results); save_image(params, results); } @@ -882,7 +903,6 @@ class CliInstance { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), params.n_threads, params.wtype); - if (upscaler_ctx == NULL) { printf("new_upscaler_ctx failed\n"); } else { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e29839b6..bbfeb8c8 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -552,6 +552,9 @@ class StableDiffusionGGML { } } + bool load_control_net_from_file(const std::string& control_path) { + } + bool load_from_file(const std::string& model_path, const std::string& vae_path, const std::string& control_net_path, diff --git a/stable-diffusion.h b/stable-diffusion.h index 642f3a3d..bc3637da 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -102,19 +102,16 @@ typedef struct { typedef struct sd_ctx_t sd_ctx_t; -SD_API sd_ctx_t* new_sd_ctx(const char* model_path, - const char* vae_path, - const char* taesd_path, - const char* control_net_path_c_str, - const char* lora_model_dir, - const char* embed_dir_c_str, +SD_API sd_ctx_t* new_sd_ctx(int n_threads, bool vae_decode_only, - bool vae_tiling, bool free_params_immediately, - int n_threads, + const char* lora_model_dir_c_str, + enum rng_type_t rng_type, + bool vae_tiling, enum sd_type_t wtype, enum schedule_t s, - bool init_backend_immediately = true); + bool keep_control_net_cpu, + bool init_backend_immediately); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); From bdb8250d972a4b874ba0c1ae106141c0c628ae77 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Sun, 4 Feb 2024 17:17:12 +0800 Subject: [PATCH 7/8] fix build fail on darwin and linux --- examples/cli/main.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 6045a4a7..09abee85 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -98,18 +98,6 @@ struct SDParams { bool canny_preprocess = false; }; -static std::string sd_basename(const std::string& path) { - size_t pos = path.find_last_of('/'); - if (pos != std::string::npos) { - return path.substr(pos + 1); - } - pos = path.find_last_of('\\'); - if (pos != std::string::npos) { - return path.substr(pos + 1); - } - return path; -} - void print_params(SDParams params) { printf("Option: \n"); printf(" n_threads: %d\n", params.n_threads); From f1f24450819769468250d742e72a2eec076e50e3 Mon Sep 17 00:00:00 2001 From: Cyberhan123 <255542417@qq.com> Date: Mon, 5 Feb 2024 16:40:34 +0800 Subject: [PATCH 8/8] format code --- examples/cli/main.cpp | 80 +++++++++++++++++++++++++++++++++++++++++++ model.cpp | 65 ++++++++++++----------------------- stable-diffusion.cpp | 22 ++++++++++-- 3 files changed, 121 insertions(+), 46 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 09abee85..50a234b7 100644 --- a/examples/cli/main.cpp +++ 
b/examples/cli/main.cpp
@@ -651,6 +651,18 @@ SDParams merge_params(SDParams dst, SDParams src) {
         }
     }
 
+    if (dst.controlnet_path != src.controlnet_path) {
+        if (!src.controlnet_path.empty()) {
+            dst.controlnet_path = src.controlnet_path;
+        }
+    }
+
+    if (dst.embeddings_path != src.embeddings_path) {
+        if (!src.embeddings_path.empty()) {
+            dst.embeddings_path = src.embeddings_path;
+        }
+    }
+
     if (dst.wtype != src.wtype) {
         dst.wtype = src.wtype;
     }
@@ -667,6 +679,18 @@ SDParams merge_params(SDParams dst, SDParams src) {
         }
     }
 
+    if (dst.input_path != src.input_path) {
+        if (!src.input_path.empty()) {
+            dst.input_path = src.input_path;
+        }
+    }
+
+    if (dst.control_image_path != src.control_image_path) {
+        if (!src.control_image_path.empty()) {
+            dst.control_image_path = src.control_image_path;
+        }
+    }
+
     if (dst.prompt != src.prompt) {
         if (!src.prompt.empty()) {
             dst.prompt = src.prompt;
@@ -701,6 +725,24 @@ SDParams merge_params(SDParams dst, SDParams src) {
         }
     }
 
+    if (dst.batch_count != src.batch_count) {
+        if (src.batch_count > 0) {
+            dst.batch_count = src.batch_count;
+        }
+    }
+
+    if (dst.sample_method != src.sample_method) {
+        if (src.sample_method < N_SAMPLE_METHODS) {
+            dst.sample_method = src.sample_method;
+        }
+    }
+
+    if (dst.schedule != src.schedule) {
+        if (src.schedule < N_SCHEDULES) {
+            dst.schedule = src.schedule;
+        }
+    }
+
     if (dst.sample_steps != src.sample_steps) {
         if (src.sample_steps > 0) {
             dst.sample_steps = src.sample_steps;
@@ -713,11 +755,43 @@ SDParams merge_params(SDParams dst, SDParams src) {
         }
     }
 
+    if (dst.control_strength != src.control_strength) {
+        if (src.control_strength >= 0.f && src.control_strength <= 1.f) {
+            dst.control_strength = src.control_strength;
+        }
+    }
+
+    if (dst.rng_type != src.rng_type) {
+        if (src.rng_type <= CUDA_RNG) {
+            dst.rng_type = src.rng_type;
+        }
+    }
+
     if (dst.seed != src.seed) {
         if (src.seed > 0) {
             dst.seed = src.seed;
         }
     }
+
+    if (dst.verbose != src.verbose) {
+        dst.verbose = src.verbose;
+    }
+
+    if (dst.vae_tiling != src.vae_tiling) {
+        dst.vae_tiling = src.vae_tiling;
+    }
+
+    if (dst.vae_decode_only != src.vae_decode_only) {
+        dst.vae_decode_only = src.vae_decode_only;
+    }
+
+    if (dst.control_net_cpu != src.control_net_cpu) {
+        dst.control_net_cpu = src.control_net_cpu;
+    }
+
+    if (dst.canny_preprocess != src.canny_preprocess) {
+        dst.canny_preprocess = src.canny_preprocess;
+    }
     return dst;
 }
diff --git a/model.cpp b/model.cpp
index 847f612c..59e788b1 100644
--- a/model.cpp
+++ b/model.cpp
@@ -275,48 +275,40 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq)
     }
 
     if (match(m, std::regex(format("unet%ctime_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) {
-        return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) +
-               m[1];
+        return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
     }
 
     if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
         std::string suffix = get_converted_suffix(m[1], m[3]);
         // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str());
-        return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) +
-               std::to_string(1 + std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq +
+        return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(1 +
std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + (m[1] == "attentions" ? "1" : "0") + seq + suffix; } if (match(m, std::regex(format("unet%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[0], m[2]); - return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + - (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) + + return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) + seq + suffix; } if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix = get_converted_suffix(m[1], m[3]); - return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + - std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + + return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq + (m[1] == "attentions" ? "1" : "0") + seq + suffix; } - if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), - key)) { - return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + - std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op"; + if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) { + return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op"; } if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) { - return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + - std::to_string(2 + std::stoi(m[0]) * 3) + seq + + return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(2 + std::stoi(m[0]) * 3) + seq + (std::stoi(m[0]) > 0 ? 
"2" : "1") + seq + "conv"; } // clip if (match(m, std::regex(format("te%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { - return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + - seq + m[1]; + return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + seq + m[1]; } if (match(m, std::regex(format("te%ctext_model(.*)", seq)), key)) { @@ -328,9 +320,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str()); } - if (match(m, - std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) { std::string suffix; std::string block_name; if (m[1] == "attentions") { @@ -344,9 +334,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, block_name.c_str(), std::stoi(m[2]) + 1, seq, suffix.c_str()); } - if (match(m, - std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) { std::string suffix = m[3]; if (suffix == "conv_shortcut") { suffix = "nin_shortcut"; @@ -355,16 +343,12 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str()); } - if (match(m, - std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) { return format("first_stage_model%c%s%cdown%c%d%cdownsample%cconv", seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq); } - if (match(m, - std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) { std::string suffix = m[3]; if (suffix == "conv_shortcut") { suffix = "nin_shortcut"; @@ -373,8 +357,7 @@ std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str()); } - if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), - key)) { + if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) { return format("first_stage_model%c%s%cup%c%d%cupsample%cconv", seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq); } @@ -511,9 +494,8 @@ void convert_tensor(void* src, ggml_type src_type, void* dst, ggml_type dst_type } else { auto qtype = ggml_internal_get_type_traits(src_type); if (qtype.to_float == NULL) { - throw std::runtime_error( - format("type %s unsupported for integer quantization: no dequantization available", - ggml_type_name(src_type))); + throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", + ggml_type_name(src_type))); } qtype.to_float(src, 
(float*)dst, n);
        }
    }
@@ -807,8 +789,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const std::string& prefix)
             ne[i] = shape[i].get<int64_t>();
         }
 
-        TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index,
-                                     ST_HEADER_SIZE_LEN + header_size_ + begin);
+        TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin);
 
         tensor_storage.reverse_ne();
 
@@ -1067,7 +1048,7 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer,
                 break;
             case ']':  // EMPTY_LIST       = b']'   # push empty list
                 break;
-            // skip unused sections
+                // skip unused sections
             case 'h':  // BINGET         = b'h'   #   "    "    "    "   "   "  ;   "    " 1-byte arg
             case 'q':  // BINPUT         = b'q'   #   "    "    "    "   "   ;   "    " 1-byte arg
             case 'Q':  // BINPERSID      = b'Q'   #   "    "    "  ;   "    "    "   " stack
@@ -1360,8 +1341,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend)
 
                     if (tensor_storage.is_bf16) {
                         // inplace op
-                        bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data,
-                                        tensor_storage.nelements());
+                        bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
                     }
                 } else {
                     read_buffer.resize(tensor_storage.nbytes());
@@ -1369,8 +1349,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend)
 
                     if (tensor_storage.is_bf16) {
                         // inplace op
-                        bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(),
-                                        tensor_storage.nelements());
+                        bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
                     }
 
                     convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
@@ -1382,8 +1361,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend)
 
                 if (tensor_storage.is_bf16) {
                     // inplace op
-                    bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(),
-                                    tensor_storage.nelements());
+                    bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
                 }
 
                 if (tensor_storage.type == dst_tensor->type) {
@@ -1443,8 +1421,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors, ggml_backend_t backend)
                     "tensor '%s' has wrong shape in model file: "
                     "got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
                     name.c_str(),
-                    (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2],
-                    (int)tensor_storage.ne[3],
+                    (int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
                     (int)real->ne[0], (int)real->ne[1], (int)real->ne[2], (int)real->ne[3]);
                 return false;
             }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index bbfeb8c8..9f556dc4 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -552,7 +552,27 @@ class StableDiffusionGGML {
         }
     }
 
-    bool load_control_net_from_file(const std::string& control_path) {
+    bool load_control_net_from_file(const std::string& control_net_path, const std::string& embeddings_path, bool control_net_cpu) {
+        // NOTE: embeddings_path is accepted for the API surface but not consumed here yet.
+        if (!control_net_path.empty()) {
+            ggml_backend_t cn_backend = NULL;
+            if (control_net_cpu && !ggml_backend_is_cpu(backend)) {
+                LOG_DEBUG("ControlNet: Using CPU backend");
+                cn_backend = ggml_backend_cpu_init();
+            } else {
+                cn_backend = backend;
+            }
+            if (!control_net.load_from_file(control_net_path, cn_backend, GGML_TYPE_F16 /* just f16 controlnet models */)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    void free_control_net_params() {
+        if (control_net.params_buffer_size > 0) {
+            control_net.free_params_buffer();
+        }
     }
 
     bool load_from_file(const std::string& model_path,
                         const std::string& vae_path,
                         const
std::string& control_net_path,
                        const std::string& embeddings_path,
                        const std::string& taesd_path,
-                       bool vae_tiling_,
+                       bool vae_tiling,
                        ggml_type wtype,
                        schedule_t schedule,
                        bool control_net_cpu) {
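
---

Usage note: taken together, patches 1/8 through 8/8 expose a split-loading workflow for custom frontends. Below is a minimal sketch of the intended call sequence, not a definitive example: the file paths and thread count are placeholders, the prefix strings mirror the defaults of the C++ methods in this series ("te.", "unet.", "vae."), and error handling is abbreviated.

    // example.cpp - illustrative only; compile and link against this library.
    #include <cstdio>

    #include "stable-diffusion.h"

    int main() {
        // Create a context with no weights loaded; the last argument asks for
        // the ggml backend to be initialized immediately.
        sd_ctx_t* ctx = new_sd_ctx(/*n_threads*/ 4,
                                   /*vae_decode_only*/ true,
                                   /*free_params_immediately*/ false,
                                   /*lora_model_dir*/ "",
                                   CUDA_RNG,
                                   /*vae_tiling*/ false,
                                   SD_TYPE_F16,
                                   KARRAS,
                                   /*keep_control_net_cpu*/ false,
                                   /*init_backend_immediately*/ true);
        if (ctx == NULL) {
            fprintf(stderr, "new_sd_ctx failed\n");
            return 1;
        }

        // Either load everything from a single checkpoint ...
        // load_diffusions_from_file(ctx, "sd-v1-4.ckpt");

        // ... or load the components one by one from standalone files.
        if (!load_clip_from_file(ctx, "clip.safetensors", "te.") ||
            !load_unet_from_file(ctx, "unet.safetensors", "unet.") ||
            !load_vae_from_file(ctx, "vae.safetensors", "vae.")) {
            fprintf(stderr, "loading weights failed\n");
            free_sd_ctx(ctx);
            return 1;
        }

        // ... run txt2img()/img2img() here, as in examples/cli/main.cpp ...

        // Components can be released individually, then the context itself.
        free_clip_params(ctx);
        free_unet_params(ctx);
        free_vae_params(ctx);
        free_sd_ctx(ctx);
        return 0;
    }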