[WIP] Add Deepcache #705

Draft: wants to merge 5 commits into master
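This draft wires DeepCache into the UNet sampling path. DeepCache exploits the redundancy between adjacent denoising steps: the deep, expensive blocks of the UNet change slowly from step to step, so their output can be cached on a periodic "full" step and reused on the steps in between, where only the shallow layers are re-evaluated. The four `dc_*` parameters threaded through this diff control that schedule; the gating they imply would look roughly like the sketch below (function and variable names are illustrative — the runner-side logic is not part of this diff):

```cpp
// Illustrative sketch of DeepCache step gating (names are not from this PR):
//   interval   - run the full UNet and refresh the cache every `interval` steps;
//                0 disables DeepCache entirely
//   start, end - sampler-step window in which caching is active
//   (depth, not used here, selects the UNet block whose output is cached)
bool deepcache_use_cached(int step, int interval, int start, int end) {
    if (interval <= 0) {
        return false;  // DeepCache disabled
    }
    if (step < start || step > end) {
        return false;  // outside the active window: always run the full model
    }
    return step % interval != 0;  // cache refreshed on steps 0, interval, 2*interval, ...
}
```

With the help-text example `--deepcache 3,3,0,1000`, steps 0, 3, 6, … run the full UNet and refresh the cache at block depth 3; every other step reuses the cached features.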
20 changes: 16 additions & 4 deletions diffusion_model.hpp
@@ -16,6 +16,7 @@ struct DiffusionModel {
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_context* persistent_work_ctx = NULL,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) = 0;
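The new `persistent_work_ctx` argument passes the sampler's long-lived `work_ctx` down to the model so a cross-step cache has somewhere to live: tensors created in the per-step compute buffer are invalidated when the graph is freed, while tensors allocated from `work_ctx` survive the whole sampling loop. A minimal sketch of that pattern, assuming CPU-resident tensors (the helper and the cache variable are illustrative, not part of this PR):

```cpp
#include <cstring>
#include "ggml.h"

// Illustrative: copy the deep feature map `h` into a tensor owned by the
// persistent context, so it outlives the per-step compute graph.
static struct ggml_tensor* dc_cached_h = NULL;

static void dc_refresh_cache(struct ggml_context* persistent_work_ctx,
                             const struct ggml_tensor* h) {
    if (dc_cached_h == NULL) {
        // one-time allocation from the long-lived context
        dc_cached_h = ggml_dup_tensor(persistent_work_ctx, h);
    }
    memcpy(dc_cached_h->data, h->data, ggml_nbytes(h));
}
```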
@@ -33,8 +34,14 @@ struct UNetModel : public DiffusionModel {
UNetModel(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
SDVersion version = VERSION_SD1,
bool flash_attn = false)
: unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
bool flash_attn = false,
// DeepCache parameters
int dc_cache_interval = 0,
int dc_cache_depth = 3,
int dc_start_steps = 0,
int dc_end_steps = 9999)
: unet(backend, tensor_types, "model.diffusion_model", version, flash_attn,
dc_cache_interval, dc_cache_depth, dc_start_steps, dc_end_steps) {
}

void alloc_params_buffer() {
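For reference, constructing the runner with DeepCache active might look like this (a sketch using the values from the CLI help example, assuming `backend` and `tensor_types` are already set up):

```cpp
// Refresh every 3rd step, cache at depth 3, active for steps 0..9999.
UNetModel unet_model(backend, tensor_types, VERSION_SD1, /*flash_attn=*/false,
                     /*dc_cache_interval=*/3, /*dc_cache_depth=*/3,
                     /*dc_start_steps=*/0, /*dc_end_steps=*/9999);
```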
@@ -71,13 +78,14 @@ struct UNetModel : public DiffusionModel {
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_context* persistent_work_ctx = NULL,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
(void)skip_layers; // SLG doesn't work with UNet models
return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
return unet.compute(n_threads, x, timesteps, context, c_concat, y, persistent_work_ctx, num_video_frames, controls, control_strength, output, output_ctx);
}
};

struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
@@ -121,9 +129,11 @@ struct MMDiTModel : public DiffusionModel {
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_context* persistent_work_ctx = NULL,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
(void)persistent_work_ctx; // Not used by MMDiT
return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
}
};
@@ -172,9 +182,11 @@ struct FluxModel : public DiffusionModel {
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_context* persistent_work_ctx = NULL,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
(void)persistent_work_ctx; // Not used by Flux
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, output, output_ctx, skip_layers);
}
};
57 changes: 56 additions & 1 deletion examples/cli/main.cpp
@@ -129,6 +129,12 @@ struct SDParams {
float slg_scale = 0.f;
float skip_layer_start = 0.01f;
float skip_layer_end = 0.2f;

// DeepCache parameters
int dc_cache_interval = 0; // 0 to disable
int dc_cache_depth = 3;
int dc_start_steps = 0;
int dc_end_steps = 9999; // Effectively all steps
};

void print_params(SDParams params) {
@@ -178,6 +184,10 @@ void print_params(SDParams params) {
printf(" batch_count: %d\n", params.batch_count);
printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
printf(" upscale_repeats: %d\n", params.upscale_repeats);
if (params.dc_cache_interval > 0) {
printf(" deepcache: interval=%d, depth=%d, start=%d, end=%d\n",
params.dc_cache_interval, params.dc_cache_depth, params.dc_start_steps, params.dc_end_steps);
}
}

void print_usage(int argc, const char* argv[]) {
@@ -244,6 +254,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
printf(" --canny apply canny preprocessor (edge detection)\n");
printf(" --color Colors the logging tags according to level\n");
printf(" --deepcache CACHE_PARAMS Enable DeepCache for UNet. CACHE_PARAMS are comma-separated: interval,depth,start_steps,end_steps. Example: \"3,3,0,1000\"\n");
printf(" -v, --verbose print extra info\n");
}
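A typical invocation with the new flag (binary and model names illustrative) would be `sd -m model.safetensors -p "a photo of a cat" --deepcache 3,3,0,1000`: refresh the cache every 3rd step, cache at depth 3, and keep caching active from step 0 through step 1000.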

@@ -629,6 +640,46 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.skip_layer_end = std::stof(argv[i]);
} else if (arg == "--deepcache") {
if (++i >= argc) {
invalid_arg = true;
break;
}
std::string dc_params_str = argv[i];
std::vector<std::string> dc_tokens;
size_t start = 0;
size_t end = dc_params_str.find(',');
while (end != std::string::npos) {
dc_tokens.push_back(dc_params_str.substr(start, end - start));
start = end + 1;
end = dc_params_str.find(',', start);
}
dc_tokens.push_back(dc_params_str.substr(start));

if (dc_tokens.size() != 4) {
fprintf(stderr, "error: --deepcache requires 4 comma-separated values: interval,depth,start_steps,end_steps\n");
exit(1);
}
try {
params.dc_cache_interval = std::stoi(dc_tokens[0]);
params.dc_cache_depth = std::stoi(dc_tokens[1]);
params.dc_start_steps = std::stoi(dc_tokens[2]);
params.dc_end_steps = std::stoi(dc_tokens[3]);
if (params.dc_cache_interval <= 0) {
fprintf(stderr, "error: deepcache interval must be > 0\n");
exit(1);
}
if (params.dc_cache_depth < 0) {
fprintf(stderr, "error: deepcache depth must be >= 0\n");
exit(1);
}
} catch (const std::invalid_argument& e) {
fprintf(stderr, "error: invalid number in --deepcache parameters: %s\n", e.what());
exit(1);
} catch (const std::out_of_range& e) {
fprintf(stderr, "error: number out of range in --deepcache parameters: %s\n", e.what());
exit(1);
}
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv);
@@ -900,7 +951,11 @@ int main(int argc, const char* argv[]) {
params.clip_on_cpu,
params.control_net_cpu,
params.vae_on_cpu,
params.diffusion_flash_attn);
params.diffusion_flash_attn,
params.dc_cache_interval,
params.dc_cache_depth,
params.dc_start_steps,
params.dc_end_steps);

if (sd_ctx == NULL) {
printf("new_sd_ctx_t failed\n");
77 changes: 70 additions & 7 deletions stable-diffusion.cpp
@@ -112,17 +112,32 @@ class StableDiffusionGGML {

std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();

// DeepCache parameters for UNet
int dc_cache_interval_unet_ = 0;
int dc_cache_depth_unet_ = 3;
int dc_start_steps_unet_ = 0;
int dc_end_steps_unet_ = 9999;

StableDiffusionGGML() = default;

StableDiffusionGGML(int n_threads,
bool vae_decode_only,
bool free_params_immediately,
std::string lora_model_dir,
rng_type_t rng_type)
rng_type_t rng_type,
// DeepCache parameters
int dc_cache_interval,
int dc_cache_depth,
int dc_start_steps,
int dc_end_steps)
: n_threads(n_threads),
vae_decode_only(vae_decode_only),
free_params_immediately(free_params_immediately),
lora_model_dir(lora_model_dir) {
lora_model_dir(lora_model_dir),
dc_cache_interval_unet_(dc_cache_interval),
dc_cache_depth_unet_(dc_cache_depth),
dc_start_steps_unet_(dc_start_steps),
dc_end_steps_unet_(dc_end_steps) {
if (rng_type == STD_DEFAULT_RNG) {
rng = std::make_shared<STDDefaultRNG>();
} else if (rng_type == CUDA_RNG) {
@@ -342,7 +357,14 @@ class StableDiffusionGGML {
} else {
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version);
}
diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
LOG_DEBUG("DeepCache: StableDiffusionGGML::load_from_file. About to create UNetModel. " \
"this->dc_cache_interval_unet_: %d, this->dc_cache_depth_unet_: %d, " \
"this->dc_start_steps_unet_: %d, this->dc_end_steps_unet_: %d",
this->dc_cache_interval_unet_, this->dc_cache_depth_unet_,
this->dc_start_steps_unet_, this->dc_end_steps_unet_);
diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn,
this->dc_cache_interval_unet_, this->dc_cache_depth_unet_,
this->dc_start_steps_unet_, this->dc_end_steps_unet_);
}

cond_stage_model->alloc_params_buffer();
@@ -617,8 +639,11 @@ class StableDiffusionGGML {
}

int64_t t0 = ggml_time_ms();
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out);
struct ggml_tensor* out_tensor = ggml_dup_tensor(work_ctx, x_t);

// compute(n_threads, x, timesteps, context, c_concat, y, guidance, num_video_frames, controls, control_strength, persistent_work_ctx, output, output_ctx, skip_layers)
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, work_ctx, &out_tensor, work_ctx);
diffusion_model->free_compute_buffer();

double result = 0.f;
@@ -890,6 +915,7 @@ class StableDiffusionGGML {
-1,
controls,
control_strength,
work_ctx,
&out_cond);
} else {
diffusion_model->compute(n_threads,
@@ -902,6 +928,7 @@ class StableDiffusionGGML {
-1,
controls,
control_strength,
work_ctx,
&out_cond);
}

@@ -922,6 +949,7 @@ class StableDiffusionGGML {
-1,
controls,
control_strength,
work_ctx,
&out_uncond);
negative_data = (float*)out_uncond->data;
}
@@ -942,6 +970,7 @@ class StableDiffusionGGML {
-1,
controls,
control_strength,
work_ctx,
&out_skip,
NULL,
skip_layers);
@@ -1130,7 +1159,12 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
bool keep_clip_on_cpu,
bool keep_control_net_cpu,
bool keep_vae_on_cpu,
bool diffusion_flash_attn) {
bool diffusion_flash_attn,
// DeepCache parameters
int dc_cache_interval,
int dc_cache_depth,
int dc_start_steps,
int dc_end_steps) {
sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
if (sd_ctx == NULL) {
return NULL;
@@ -1151,7 +1185,11 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
vae_decode_only,
free_params_immediately,
lora_model_dir,
rng_type);
rng_type,
dc_cache_interval,
dc_cache_depth,
dc_start_steps,
dc_end_steps);
if (sd_ctx->sd == NULL) {
return NULL;
}
Expand Down Expand Up @@ -1439,6 +1477,15 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);

sd_ctx->sd->rng->manual_seed(cur_seed);

// Reset DeepCache state for the UNet model for this new image/seed
auto unet_model = std::dynamic_pointer_cast<UNetModel>(sd_ctx->sd->diffusion_model);
if (unet_model) {
    unet_model->unet.unet.reset_deepcache_state();
}

struct ggml_tensor* x_t = init_latent;
struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
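The per-image reset above matters because the interval gating counts sampler steps: without it, the second image of a batch would start mid-interval and could reuse features cached from the previous seed. The actual `reset_deepcache_state()` lives in the UNet runner, which this diff does not show; a plausible minimal shape (member names assumed) would be:

```cpp
// Assumed shape of the runner-side reset; member names are illustrative.
void reset_deepcache_state() {
    dc_current_step_ = 0;     // restart interval counting for the new image
    dc_cached_h_     = NULL;  // drop features cached for the previous seed
}
```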
@@ -1561,6 +1608,14 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB
}

auto unet_model = std::dynamic_pointer_cast<UNetModel>(sd_ctx->sd->diffusion_model);
if (unet_model && unet_model->unet.unet.dc_cache_interval_ > 0) {
LOG_DEBUG("Allocating extra memory for DeepCache tensor");
size_t cache_tensor_size = 1280 * (height/8) * (width/8) * ggml_type_size(sd_ctx->sd->model_wtype);
params.mem_size += cache_tensor_size;
}

params.mem_size += width * height * 3 * sizeof(float);
params.mem_size *= batch_count;
params.mem_buffer = NULL;
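The reservation sizes the cache for a 1280-channel feature map at the latent resolution (height/8 × width/8). For a 512×512 image with an f16 weight type (`ggml_type_size` = 2 bytes) that is 1280 × 64 × 64 × 2 = 10,485,760 bytes, exactly 10 MiB on top of the usual work buffer.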
@@ -1673,6 +1728,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB
}

auto unet_model = std::dynamic_pointer_cast<UNetModel>(sd_ctx->sd->diffusion_model);
if (unet_model && unet_model->unet.unet.dc_cache_interval_ > 0) {
LOG_DEBUG("Allocating extra memory for DeepCache tensor");
size_t cache_tensor_size = 1280 * (height/8) * (width/8) * ggml_type_size(sd_ctx->sd->model_wtype);
params.mem_size += cache_tensor_size;
}

params.mem_size += width * height * 3 * sizeof(float) * 3;
params.mem_size *= batch_count;
params.mem_buffer = NULL;
7 changes: 6 additions & 1 deletion stable-diffusion.h
@@ -150,7 +150,12 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
bool keep_clip_on_cpu,
bool keep_control_net_cpu,
bool keep_vae_on_cpu,
bool diffusion_flash_attn);
bool diffusion_flash_attn,
// DeepCache parameters
int dc_cache_interval = 0,
int dc_cache_depth = 3,
int dc_start_steps = 0,
int dc_end_steps = 9999);

SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
