[WIP] Add Deepcache #705

Draft: wants to merge 5 commits into master
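This draft wires DeepCache into the UNet sampling path. DeepCache exploits the redundancy between adjacent denoising steps: the deep, expensive blocks of the UNet change slowly from step to step, so their output can be cached on a periodic "full" step and reused on the steps in between, where only the shallow layers are re-evaluated. The four `dc_*` parameters threaded through this diff control that schedule; the gating they imply would look roughly like the sketch below (function and variable names are illustrative — the runner-side logic is not part of this diff):

```cpp
// Illustrative sketch of DeepCache step gating (names are not from this PR):
//   interval   - run the full UNet and refresh the cache every `interval` steps;
//                0 disables DeepCache entirely
//   start, end - sampler-step window in which caching is active
//   (depth, not used here, selects the UNet block whose output is cached)
bool deepcache_use_cached(int step, int interval, int start, int end) {
    if (interval <= 0) {
        return false;  // DeepCache disabled
    }
    if (step < start || step > end) {
        return false;  // outside the active window: always run the full model
    }
    return step % interval != 0;  // cache refreshed on steps 0, interval, 2*interval, ...
}
```

With the help-text example `--deepcache 3,3,0,1000`, steps 0, 3, 6, … run the full UNet and refresh the cache at block depth 3; every other step reuses the cached features.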
20 changes: 16 additions & 4 deletions diffusion_model.hpp
@@ -16,6 +16,7 @@ struct DiffusionModel {
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_context* persistent_work_ctx = NULL,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) = 0;
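The new `persistent_work_ctx` argument passes the sampler's long-lived `work_ctx` down to the model so a cross-step cache has somewhere to live: tensors created in the per-step compute buffer are invalidated when the graph is freed, while tensors allocated from `work_ctx` survive the whole sampling loop. A minimal sketch of that pattern, assuming CPU-resident tensors (the helper and the cache variable are illustrative, not part of this PR):

```cpp
#include <cstring>
#include "ggml.h"

// Illustrative: copy the deep feature map `h` into a tensor owned by the
// persistent context, so it outlives the per-step compute graph.
static struct ggml_tensor* dc_cached_h = NULL;

static void dc_refresh_cache(struct ggml_context* persistent_work_ctx,
                             const struct ggml_tensor* h) {
    if (dc_cached_h == NULL) {
        // one-time allocation from the long-lived context
        dc_cached_h = ggml_dup_tensor(persistent_work_ctx, h);
    }
    memcpy(dc_cached_h->data, h->data, ggml_nbytes(h));
}
```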
@@ -33,8 +34,14 @@ struct UNetModel : public DiffusionModel {
UNetModel(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
SDVersion version = VERSION_SD1,
bool flash_attn = false)
: unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
bool flash_attn = false,
// DeepCache parameters
int dc_cache_interval = 0,
int dc_cache_depth = 3,
int dc_start_steps = 0,
int dc_end_steps = 9999)
: unet(backend, tensor_types, "model.diffusion_model", version, flash_attn,
dc_cache_interval, dc_cache_depth, dc_start_steps, dc_end_steps) {
}

void alloc_params_buffer() {
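For reference, constructing the runner with DeepCache active might look like this (a sketch using the values from the CLI help example, assuming `backend` and `tensor_types` are already set up):

```cpp
// Refresh every 3rd step, cache at depth 3, active for steps 0..9999.
UNetModel unet_model(backend, tensor_types, VERSION_SD1, /*flash_attn=*/false,
                     /*dc_cache_interval=*/3, /*dc_cache_depth=*/3,
                     /*dc_start_steps=*/0, /*dc_end_steps=*/9999);
```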
@@ -71,13 +78,14 @@ struct UNetModel : public DiffusionModel {
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_context* persistent_work_ctx = NULL,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
(void)skip_layers; // SLG doesn't work with UNet models
return unet.compute(n_threads, x, timesteps, context, c_concat, y, num_video_frames, controls, control_strength, output, output_ctx);
return unet.compute(n_threads, x, timesteps, context, c_concat, y, persistent_work_ctx, num_video_frames, controls, control_strength, output, output_ctx);
}
};

struct MMDiTModel : public DiffusionModel {
MMDiTRunner mmdit;
@@ -121,9 +129,11 @@ struct MMDiTModel : public DiffusionModel {
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_context* persistent_work_ctx = NULL,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
(void)persistent_work_ctx; // Not used by MMDiT
return mmdit.compute(n_threads, x, timesteps, context, y, output, output_ctx, skip_layers);
}
};
@@ -172,9 +182,11 @@ struct FluxModel : public DiffusionModel {
int num_video_frames = -1,
std::vector<struct ggml_tensor*> controls = {},
float control_strength = 0.f,
struct ggml_context* persistent_work_ctx = NULL,
struct ggml_tensor** output = NULL,
struct ggml_context* output_ctx = NULL,
std::vector<int> skip_layers = std::vector<int>()) {
(void)persistent_work_ctx; // Not used by Flux
return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, output, output_ctx, skip_layers);
}
};
57 changes: 56 additions & 1 deletion examples/cli/main.cpp
@@ -129,6 +129,12 @@ struct SDParams {
float slg_scale = 0.f;
float skip_layer_start = 0.01f;
float skip_layer_end = 0.2f;

// DeepCache parameters
int dc_cache_interval = 0; // 0 to disable
int dc_cache_depth = 3;
int dc_start_steps = 0;
int dc_end_steps = 9999; // Effectively all steps
};

void print_params(SDParams params) {
@@ -178,6 +184,10 @@ void print_params(SDParams params) {
printf(" batch_count: %d\n", params.batch_count);
printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false");
printf(" upscale_repeats: %d\n", params.upscale_repeats);
if (params.dc_cache_interval > 0) {
printf(" deepcache: interval=%d, depth=%d, start=%d, end=%d\n",
params.dc_cache_interval, params.dc_cache_depth, params.dc_start_steps, params.dc_end_steps);
}
}

void print_usage(int argc, const char* argv[]) {
@@ -244,6 +254,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
printf(" --canny apply canny preprocessor (edge detection)\n");
printf(" --color Colors the logging tags according to level\n");
printf(" --deepcache CACHE_PARAMS Enable DeepCache for UNet. CACHE_PARAMS are comma-separated: interval,depth,start_steps,end_steps. Example: \"3,3,0,1000\"\n");
printf(" -v, --verbose print extra info\n");
}
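A typical invocation with the new flag (binary and model names illustrative) would be `sd -m model.safetensors -p "a photo of a cat" --deepcache 3,3,0,1000`: refresh the cache every 3rd step, cache at depth 3, and keep caching active from step 0 through step 1000.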

@@ -629,6 +640,46 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.skip_layer_end = std::stof(argv[i]);
} else if (arg == "--deepcache") {
if (++i >= argc) {
invalid_arg = true;
break;
}
std::string dc_params_str = argv[i];
std::vector<std::string> dc_tokens;
size_t start = 0;
size_t end = dc_params_str.find(',');
while (end != std::string::npos) {
dc_tokens.push_back(dc_params_str.substr(start, end - start));
start = end + 1;
end = dc_params_str.find(',', start);
}
dc_tokens.push_back(dc_params_str.substr(start));

if (dc_tokens.size() != 4) {
fprintf(stderr, "error: --deepcache requires 4 comma-separated values: interval,depth,start_steps,end_steps\n");
exit(1);
}
try {
params.dc_cache_interval = std::stoi(dc_tokens[0]);
params.dc_cache_depth = std::stoi(dc_tokens[1]);
params.dc_start_steps = std::stoi(dc_tokens[2]);
params.dc_end_steps = std::stoi(dc_tokens[3]);
if (params.dc_cache_interval <= 0) {
fprintf(stderr, "error: deepcache interval must be > 0\n");
exit(1);
}
if (params.dc_cache_depth < 0) {
fprintf(stderr, "error: deepcache depth must be >= 0\n");
exit(1);
}
} catch (const std::invalid_argument& e) {
fprintf(stderr, "error: invalid number in --deepcache parameters: %s\n", e.what());
exit(1);
} catch (const std::out_of_range& e) {
fprintf(stderr, "error: number out of range in --deepcache parameters: %s\n", e.what());
exit(1);
}
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv);
@@ -900,7 +951,11 @@ int main(int argc, const char* argv[]) {
params.clip_on_cpu,
params.control_net_cpu,
params.vae_on_cpu,
params.diffusion_flash_attn);
params.diffusion_flash_attn,
params.dc_cache_interval,
params.dc_cache_depth,
params.dc_start_steps,
params.dc_end_steps);

if (sd_ctx == NULL) {
printf("new_sd_ctx_t failed\n");
77 changes: 70 additions & 7 deletions stable-diffusion.cpp
@@ -112,17 +112,32 @@ class StableDiffusionGGML {

std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();

// DeepCache parameters for UNet
int dc_cache_interval_unet_ = 0;
int dc_cache_depth_unet_ = 3;
int dc_start_steps_unet_ = 0;
int dc_end_steps_unet_ = 9999;

StableDiffusionGGML() = default;

StableDiffusionGGML(int n_threads,
bool vae_decode_only,
bool free_params_immediately,
std::string lora_model_dir,
rng_type_t rng_type)
rng_type_t rng_type,
// DeepCache parameters
int dc_cache_interval,
int dc_cache_depth,
int dc_start_steps,
int dc_end_steps)
: n_threads(n_threads),
vae_decode_only(vae_decode_only),
free_params_immediately(free_params_immediately),
lora_model_dir(lora_model_dir) {
lora_model_dir(lora_model_dir),
dc_cache_interval_unet_(dc_cache_interval),
dc_cache_depth_unet_(dc_cache_depth),
dc_start_steps_unet_(dc_start_steps),
dc_end_steps_unet_(dc_end_steps) {
if (rng_type == STD_DEFAULT_RNG) {
rng = std::make_shared<STDDefaultRNG>();
} else if (rng_type == CUDA_RNG) {
@@ -342,7 +357,14 @@ class StableDiffusionGGML {
} else {
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version);
}
diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
LOG_DEBUG("DeepCache: StableDiffusionGGML::load_from_file. About to create UNetModel. " \
"this->dc_cache_interval_unet_: %d, this->dc_cache_depth_unet_: %d, " \
"this->dc_start_steps_unet_: %d, this->dc_end_steps_unet_: %d",
this->dc_cache_interval_unet_, this->dc_cache_depth_unet_,
this->dc_start_steps_unet_, this->dc_end_steps_unet_);
diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn,
this->dc_cache_interval_unet_, this->dc_cache_depth_unet_,
this->dc_start_steps_unet_, this->dc_end_steps_unet_);
}

cond_stage_model->alloc_params_buffer();
@@ -617,8 +639,11 @@ class StableDiffusionGGML {
}

int64_t t0 = ggml_time_ms();
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out);
struct ggml_tensor* out_tensor = ggml_dup_tensor(work_ctx, x_t);

// compute(n_threads, x, timesteps, context, c_concat, y, guidance, num_video_frames, controls, control_strength, persistent_work_ctx, output, output_ctx, skip_layers)
diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, work_ctx, &out_tensor, work_ctx);
diffusion_model->free_compute_buffer();

double result = 0.f;
@@ -890,6 +915,7 @@ class StableDiffusionGGML {
-1,
controls,
control_strength,
work_ctx,
&out_cond);
} else {
diffusion_model->compute(n_threads,
@@ -902,6 +928,7 @@ class StableDiffusionGGML {
-1,
controls,
control_strength,
work_ctx,
&out_cond);
}

@@ -922,6 +949,7 @@ class StableDiffusionGGML {
-1,
controls,
control_strength,
work_ctx,
&out_uncond);
negative_data = (float*)out_uncond->data;
}
@@ -942,6 +970,7 @@ class StableDiffusionGGML {
-1,
controls,
control_strength,
work_ctx,
&out_skip,
NULL,
skip_layers);
@@ -1130,7 +1159,12 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
bool keep_clip_on_cpu,
bool keep_control_net_cpu,
bool keep_vae_on_cpu,
bool diffusion_flash_attn) {
bool diffusion_flash_attn,
// DeepCache parameters
int dc_cache_interval,
int dc_cache_depth,
int dc_start_steps,
int dc_end_steps) {
sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
if (sd_ctx == NULL) {
return NULL;
@@ -1151,7 +1185,11 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
vae_decode_only,
free_params_immediately,
lora_model_dir,
rng_type);
rng_type,
dc_cache_interval,
dc_cache_depth,
dc_start_steps,
dc_end_steps);
if (sd_ctx->sd == NULL) {
return NULL;
}
Expand Down Expand Up @@ -1439,6 +1477,15 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);

sd_ctx->sd->rng->manual_seed(cur_seed);

// Reset DeepCache state for the UNet model for this new image/seed
auto unet_model = std::dynamic_pointer_cast<UNetModel>(sd_ctx->sd->diffusion_model);
if (unet_model) {
    unet_model->unet.unet.reset_deepcache_state();
}

struct ggml_tensor* x_t = init_latent;
struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
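The per-image reset above matters because the interval gating counts sampler steps: without it, the second image of a batch would start mid-interval and could reuse features cached from the previous seed. The actual `reset_deepcache_state()` lives in the UNet runner, which this diff does not show; a plausible minimal shape (member names assumed) would be:

```cpp
// Assumed shape of the runner-side reset; member names are illustrative.
void reset_deepcache_state() {
    dc_current_step_ = 0;     // restart interval counting for the new image
    dc_cached_h_     = NULL;  // drop features cached for the previous seed
}
```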
@@ -1561,6 +1608,14 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB
}

auto unet_model = std::dynamic_pointer_cast<UNetModel>(sd_ctx->sd->diffusion_model);
if (unet_model && unet_model->unet.unet.dc_cache_interval_ > 0) {
LOG_DEBUG("Allocating extra memory for DeepCache tensor");
size_t cache_tensor_size = 1280 * (height/8) * (width/8) * ggml_type_size(sd_ctx->sd->model_wtype);
params.mem_size += cache_tensor_size;
}

params.mem_size += width * height * 3 * sizeof(float);
params.mem_size *= batch_count;
params.mem_buffer = NULL;
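The reservation sizes the cache for a 1280-channel feature map at the latent resolution (height/8 × width/8). For a 512×512 image with an f16 weight type (`ggml_type_size` = 2 bytes) that is 1280 × 64 × 64 × 2 = 10,485,760 bytes, exactly 10 MiB on top of the usual work buffer.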
@@ -1673,6 +1728,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB
}

auto unet_model = std::dynamic_pointer_cast<UNetModel>(sd_ctx->sd->diffusion_model);
if (unet_model && unet_model->unet.unet.dc_cache_interval_ > 0) {
LOG_DEBUG("Allocating extra memory for DeepCache tensor");
size_t cache_tensor_size = 1280 * (height/8) * (width/8) * ggml_type_size(sd_ctx->sd->model_wtype);
params.mem_size += cache_tensor_size;
}

params.mem_size += width * height * 3 * sizeof(float) * 3;
params.mem_size *= batch_count;
params.mem_buffer = NULL;
7 changes: 6 additions & 1 deletion stable-diffusion.h
@@ -150,7 +150,12 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
bool keep_clip_on_cpu,
bool keep_control_net_cpu,
bool keep_vae_on_cpu,
bool diffusion_flash_attn);
bool diffusion_flash_attn,
// DeepCache parameters
int dc_cache_interval = 0,
int dc_cache_depth = 3,
int dc_start_steps = 0,
int dc_end_steps = 9999);

SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
