Merge branch 'main' into tests-encode-prompt

huggingface · Feb 11, 2025 · e2f34ad · e2f34ad
2 parents a4a917e + 8ae8008
commit e2f34ad
Show file tree

Hide file tree

Showing 23 changed files with 842 additions and 213 deletions.
diff --git a/docs/source/en/using-diffusers/img2img.md b/docs/source/en/using-diffusers/img2img.md
@@ -461,12 +461,12 @@ Chain it to an upscaler pipeline to increase the image resolution:
 from diffusers import StableDiffusionLatentUpscalePipeline
 
 upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
-    "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+    "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16, use_safetensors=True
 )
 upscaler.enable_model_cpu_offload()
 upscaler.enable_xformers_memory_efficient_attention()
 
-image_2 = upscaler(prompt, image=image_1, output_type="latent").images[0]
+image_2 = upscaler(prompt, image=image_1).images[0]
 ```
 
 Finally, chain it to a super-resolution pipeline to further enhance the resolution:

diff --git a/docs/source/en/using-diffusers/write_own_pipeline.md b/docs/source/en/using-diffusers/write_own_pipeline.md
@@ -106,7 +106,7 @@ Let's try it out!
 
 ## Deconstruct the Stable Diffusion pipeline
 
-Stable Diffusion is a text-to-image *latent diffusion* model. It is called a latent diffusion model because it works with a lower-dimensional representation of the image instead of the actual pixel space, which makes it more memory efficient. The encoder compresses the image into a smaller representation, and a decoder to convert the compressed representation back into an image. For text-to-image models, you'll need a tokenizer and an encoder to generate text embeddings. From the previous example, you already know you need a UNet model and a scheduler.
+Stable Diffusion is a text-to-image *latent diffusion* model. It is called a latent diffusion model because it works with a lower-dimensional representation of the image instead of the actual pixel space, which makes it more memory efficient. The encoder compresses the image into a smaller representation, and a decoder converts the compressed representation back into an image. For text-to-image models, you'll need a tokenizer and an encoder to generate text embeddings. From the previous example, you already know you need a UNet model and a scheduler.
 
 As you can see, this is already more complex than the DDPM pipeline, which only contains a UNet model. The Stable Diffusion model has three separate pretrained models.
 

diff --git a/examples/community/README.md b/examples/community/README.md
diff --git a/examples/dreambooth/train_dreambooth_lora_sana.py b/examples/dreambooth/train_dreambooth_lora_sana.py
@@ -995,7 +995,8 @@ def main(args):
     if args.enable_npu_flash_attention:
         if is_torch_npu_available():
             logger.info("npu flash attention enabled.")
-            transformer.enable_npu_flash_attention()
+            for block in transformer.transformer_blocks:
+                block.attn2.set_use_npu_flash_attention(True)
         else:
             raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu device ")
 

diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py
@@ -695,7 +695,7 @@ def preprocess_images(examples):
         )
         # We need to ensure that the original and the edited images undergo the same
         # augmentation transforms.
-        images = np.concatenate([original_images, edited_images])
+        images = np.stack([original_images, edited_images])
         images = torch.tensor(images)
         images = 2 * (images / 255) - 1
         return train_transforms(images)
@@ -706,7 +706,7 @@ def preprocess_train(examples):
         # Since the original and edited images were stacked before
         # applying the transformations, we need to separate them and reshape
         # them accordingly.
-        original_images, edited_images = preprocessed_images.chunk(2)
+        original_images, edited_images = preprocessed_images
         original_images = original_images.reshape(-1, 3, args.resolution, args.resolution)
         edited_images = edited_images.reshape(-1, 3, args.resolution, args.resolution)
 

diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py b/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py
@@ -766,7 +766,7 @@ def preprocess_images(examples):
         )
         # We need to ensure that the original and the edited images undergo the same
         # augmentation transforms.
-        images = np.concatenate([original_images, edited_images])
+        images = np.stack([original_images, edited_images])
         images = torch.tensor(images)
         images = 2 * (images / 255) - 1
         return train_transforms(images)
@@ -906,7 +906,7 @@ def preprocess_train(examples):
         # Since the original and edited images were stacked before
         # applying the transformations, we need to separate them and reshape
         # them accordingly.
-        original_images, edited_images = preprocessed_images.chunk(2)
+        original_images, edited_images = preprocessed_images
         original_images = original_images.reshape(-1, 3, args.resolution, args.resolution)
         edited_images = edited_images.reshape(-1, 3, args.resolution, args.resolution)
 

diff --git a/examples/model_search/README.md b/examples/model_search/README.md
@@ -82,31 +82,11 @@ pipeline = EasyPipelineForInpainting.from_huggingface(
 ## Search Civitai and Huggingface
 
 ```python
-from pipeline_easy import (
-    search_huggingface,
-    search_civitai,
-) 
-
-# Search Lora
-Lora = search_civitai(
-    "Keyword_to_search_Lora",
-    model_type="LORA",
-    base_model = "SD 1.5",
-    download=True,
-    )
 # Load Lora into the pipeline.
-pipeline.load_lora_weights(Lora)
-
+pipeline.auto_load_lora_weights("Detail Tweaker")
 
-# Search TextualInversion
-TextualInversion = search_civitai(
-    "EasyNegative",
-    model_type="TextualInversion",
-    base_model = "SD 1.5",
-    download=True
-)
 # Load TextualInversion into the pipeline.
-pipeline.load_textual_inversion(TextualInversion, token="EasyNegative")
+pipeline.auto_load_textual_inversion("EasyNegative", token="EasyNegative")
 ```
 
 ### Search Civitai