diff --git a/assets/charts/chart_dtype_VRAM_footprint_compared.png b/assets/charts/chart_dtype_VRAM_footprint_compared.png
new file mode 100644
index 0000000..e9e0396
Binary files /dev/null and b/assets/charts/chart_dtype_VRAM_footprint_compared.png differ
diff --git a/assets/charts/chart_dtype_inference_and_loading_speeds_compared.png b/assets/charts/chart_dtype_inference_and_loading_speeds_compared.png
new file mode 100644
index 0000000..ab5d0b4
Binary files /dev/null and b/assets/charts/chart_dtype_inference_and_loading_speeds_compared.png differ
diff --git a/assets/dtype_comparison_two_images.jpg b/assets/dtype_comparison_two_images.jpg
new file mode 100644
index 0000000..ffe84c4
Binary files /dev/null and b/assets/dtype_comparison_two_images.jpg differ
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index e88c84c..1c3ab70 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -1,3 +1,14 @@
+### 20.03.2024
+
+* Set guidance_scale (decoder) to 1.9 and num_inference_steps to 54 for optimal image quality.
+* **Key finding:** Using torch.bfloat16 for the decoder significantly increased model loading speed (3.24x faster) compared to torch.float16. Other performance metrics remained virtually unchanged, and surprisingly, there was no perceptible difference in image quality (see [Figure 1](https://github.com/EtienneDosSantos/stable-cascade-one-click-installer/blob/dev/assets/dtype_comparison_two_images.jpg)).
+    * **Charts:** I've created two charts visualizing these results (see [Figure 2](https://github.com/EtienneDosSantos/stable-cascade-one-click-installer/blob/dev/assets/charts/chart_dtype_inference_and_loading_speeds_compared.png), [Figure 3](https://github.com/EtienneDosSantos/stable-cascade-one-click-installer/blob/dev/assets/charts/chart_dtype_VRAM_footprint_compared.png)).
+
+### 19.03.2024
+
+* **[PR #7381:](https://github.com/huggingface/diffusers/pull/7381)**
+    * Fixed the bug so multiple images can be generated per prompt again – thx [@DN6](https://github.com/DN6)! 🎉
+
 ### 17.03.2024
 
 * **[PR #31:](https://github.com/EtienneDosSantos/stable-cascade-one-click-installer/commit/e84010c83daa126b10cecae584cb8a4979689528)**
diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md
index f263e45..89bb76b 100644
--- a/docs/ROADMAP.md
+++ b/docs/ROADMAP.md
@@ -1,6 +1,23 @@
 ### Features to Add:
 
-**1. Image Metadata Storage**
+**3. Test Decoder Dtype Influence** ✔️
+
+* **`torch.bfloat16` vs. `torch.float16`:**
+  - [x] VRAM footprint
+  - [x] Inference speed
+  - [x] Image quality
+
+**2. Batch Size Fix (>1)** ✔️
+
+* **Goal:** Restore the ability to generate multiple images per prompt.
+  - [x] Not getting anywhere on my own; opened [issue #7377](https://github.com/huggingface/diffusers/issues/7377) to hopefully get this resolved.
+  - [x] **Issue Review:** Tested the provided fix ([PR #7381](https://github.com/huggingface/diffusers/pull/7381))! Amazing work, thx [@DN6](https://github.com/DN6)! 🎉
+* **Troubleshooting Steps:**
+  - [ ] **Error Analysis:** Identify the specific error or unexpected behavior.
+  - [ ] **Code Review:** Examine logic related to batch size handling.
+  - [ ] **Dependency Check:** Ensure compatibility between any updated libraries and the batching functionality.
+
+**1. Image Metadata Storage** ✔️
 
 * **Goal:** Embed essential generation parameters within generated images for reproducibility and analysis.
 * **Metadata to Include:**
@@ -8,18 +25,10 @@
   - [x] Number of steps
   - [x] Model name
   - [x] CFG value
-  - [ ] Sampler
+  - [x] Sampler
   - [x] Prompt
 
 * **Implementation Steps:**
   - **Library Selection:** Research image metadata libraries (e.g., ExifWrite, PIL/Pillow).
   - **Integration:** Modify image generation code to write metadata.
   - **Testing:** Verify metadata is written and readable.
-
-**2. Batch Size Fix (>1)**
-
-* **Goal:** Restore the ability to generate multiple images per prompt.
-* **Troubleshooting Steps:**
-  - [ ] **Error Analysis:** Identify the specific error or unexpected behavior.
-  - [ ] **Code Review:** Examine logic related to batch size handling.
-  - [ ] **Dependency Check:** Ensure compatibility between any updated libraries and the batching functionality.
diff --git a/requirements.txt b/requirements.txt
index 0f736ed..34c5551 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,7 +11,7 @@
 --find-links https://download.pytorch.org/whl/torch_stable.html
 accelerate>=0.25.0
-diffusers==0.27.0
+diffusers==0.27.2
 einops>=0.7.0
 gradio
 kornia>=0.7.0
diff --git a/run.py b/run.py
index 0236f6c..167406a 100644
--- a/run.py
+++ b/run.py
@@ -6,7 +6,7 @@
 # Stability AI Non-Commercial Research Community License Agreement, dated November 28, 2023.
 # For more information, see https://stability.ai/use-policy.
 
-from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline
+from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline, StableCascadeUNet
 import gradio as gr
 import json
 import os
@@ -24,9 +24,9 @@
 def load_model(model_name):
     # Load model from disk every time it's needed
     if model_name == "prior":
-        model = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=dtype).to(device)
+        model = StableCascadePriorPipeline.from_pretrained("stabilityai/stable-cascade-prior", variant="bf16", torch_dtype=dtype, use_safetensors=True).to(device)
     elif model_name == "decoder":
-        model = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.float16).to(device)
+        model = StableCascadeDecoderPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=dtype, use_safetensors=True).to(device)
     else:
         raise ValueError(f"Unknown model name: {model_name}")
     return model
@@ -79,26 +79,23 @@ def generate_images(prompt, height, width, negative_prompt, guidance_scale, num_
         num_images_per_prompt=int(num_images_per_prompt),
         generator=generator,
     )
-    del prior # Explicitly delete the model to help with memory management
-    torch.cuda.empty_cache() # Clear the CUDA cache to free up unused memory
 
     # Load, use, and discard the decoder model
     decoder = load_model("decoder")
     decoder.enable_model_cpu_offload()
     decoder_output = decoder(
-        image_embeddings=prior_output.image_embeddings.to(torch.float16),
+        image_embeddings=prior_output.image_embeddings.to(dtype),
         prompt=cleaned_prompt,
        negative_prompt=negative_prompt,
-        guidance_scale=0.0,
+        guidance_scale=1.9, # Guidance scale is enabled by setting guidance_scale > 1
         num_inference_steps=calculated_steps_decoder,
         output_type="pil",
         generator=generator,
     ).images
-    del decoder # Explicitly delete the model to help with memory management
-    torch.cuda.empty_cache() # Clear the CUDA cache to free up unused memory
-
+
     metadata_embedded = {
         "parameters": "Stable Cascade",
+        "scheduler": "DDPMWuerstchenScheduler",
         "prompt": cleaned_prompt,
         "negative_prompt": negative_prompt,
         "width": int(width),
@@ -190,8 +187,8 @@ def configure_ui():
             height = gr.Slider(minimum=512, maximum=2048, step=1, value=1024, label="Image Height")
 
         with gr.Column(): # components in central column
-            num_inference_steps = gr.Slider(minimum=1, maximum=150, step=1, value=30, label="Steps")
-            num_images_per_prompt = gr.Number(label="Number of Images per Prompt (Currently, the system can only generate one image at a time. Please leave the 'Images per Prompt' setting at 1 until this issue is fixed.)", value=1)
+            num_inference_steps = gr.Slider(minimum=1, maximum=150, step=1, value=54, label="Steps")
+            num_images_per_prompt = gr.Number(label="Number of Images per Prompt", value=2)
 
         with gr.Column(): # components in right column
             guidance_scale = gr.Slider(minimum=1, maximum=20, step=0.5, value=4.0, label="Guidance Scale")
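For anyone who wants to reproduce the dtype comparison behind the 20.03.2024 changelog entry, here is a minimal benchmarking sketch. It is not part of the patch: the model ID, `variant="bf16"`, and `use_safetensors=True` mirror `run.py`, the helper name `benchmark_decoder_load` and the hardcoded `"cuda"` device are my own, and the timing and peak-VRAM measurements use standard `torch.cuda` utilities. Actual numbers will vary with hardware and disk cache.

```python
import time

import torch
from diffusers import StableCascadeDecoderPipeline


def benchmark_decoder_load(dtype: torch.dtype) -> tuple[float, float]:
    """Time a decoder pipeline load and record peak VRAM for the given dtype."""
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    start = time.perf_counter()
    pipe = StableCascadeDecoderPipeline.from_pretrained(
        "stabilityai/stable-cascade",
        variant="bf16",
        torch_dtype=dtype,
        use_safetensors=True,
    ).to("cuda")
    load_seconds = time.perf_counter() - start
    peak_gib = torch.cuda.max_memory_allocated() / 1024**3
    del pipe
    torch.cuda.empty_cache()
    return load_seconds, peak_gib


if __name__ == "__main__":
    # Run bfloat16 and float16 back to back; the second load benefits from
    # the OS file cache, so alternate the order across runs for fair numbers.
    for dtype in (torch.bfloat16, torch.float16):
        seconds, peak = benchmark_decoder_load(dtype)
        print(f"{dtype}: loaded in {seconds:.2f}s, peak VRAM {peak:.2f} GiB")
```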
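The diff shows the `metadata_embedded` dict gaining a `"scheduler"` key but not how the dict is written into the saved files. One way the completed "Image Metadata Storage" roadmap item could be implemented with Pillow (mentioned under **Library Selection**) is via PNG text chunks. This is a sketch, not necessarily how `run.py` actually does it; `save_with_metadata` is a hypothetical helper, and `image`/`metadata_embedded` are assumed to come from the generation loop.

```python
import json

from PIL import Image
from PIL.PngImagePlugin import PngInfo


def save_with_metadata(image: Image.Image, metadata: dict, path: str) -> None:
    """Embed generation parameters in a PNG text chunk so they travel with the file."""
    info = PngInfo()
    info.add_text("parameters", json.dumps(metadata))
    image.save(path, pnginfo=info)


# Verifying the metadata round-trips:
#   Image.open(path).text["parameters"] returns the JSON string written above.
```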