Commit 9199552

Authored Apr 12, 2023
Merge pull request #5 from mkshing/v0.2.0
v0.2.0
2 parents 7f47d31 + c645374 commit 9199552

20 files changed: +3,727 −386 lines
 

‎README.md

+90 −12

@@ -12,10 +12,17 @@ My summary tweet is found [here](https://twitter.com/mk1stats/status/16428655051
 left: LoRA, right: SVDiff
 
 
-Compared with LoRA, the number of trainable parameters is 0.6 M less parameters and the file size is only <1MB (LoRA: 3.1MB)!!
+Compared with LoRA, the number of trainable parameters is 0.5 M fewer and the file size is only 1.2MB (LoRA: 3.1MB)!!
 
 ![kumamon](assets/kumamon.png)
 
+## Updates
+### 2023.4.11
+- Released v0.2.0 (please see [here](https://github.com/mkshing/svdiff-pytorch/releases/tag/v0.2.0) for the details)
+- Add [Single Image Editing](#single-image-editing)
+  ![chair-result](assets/chair-result.png)
+  <br>"photo of a ~~pink~~ blue chair with black legs"
+
 ## Installation
 ```
 $ pip install svdiff-pytorch
@@ -26,9 +33,10 @@ $ git clone https://github.com/mkshing/svdiff-pytorch
 $ pip install -r requirements.txt
 ```
 
-## Training
-The following example script is for "Single-Subject Generation", which is a domain-tuning on a single object or concept (using 3-5 images). (See Section 4.1)
+## Single-Subject Generation
+"Single-Subject Generation" is domain tuning on a single object or concept (using 3-5 images). (See Section 4.1)
 
+### Training
 According to the paper, the learning rate for SVDiff needs to be 1000 times larger than the lr used for fine-tuning.
 
 ```bash
@@ -48,29 +56,32 @@ accelerate launch train_svdiff.py \
   --resolution=512 \
   --train_batch_size=1 \
   --gradient_accumulation_steps=1 \
-  --learning_rate=5e-3 \
+  --learning_rate=1e-3 \
+  --learning_rate_1d=1e-6 \
+  --train_text_encoder \
   --lr_scheduler="constant" \
   --lr_warmup_steps=0 \
   --num_class_images=200 \
-  --max_train_steps=800
+  --max_train_steps=500
 ```
 
-
-## Inference
+### Inference
 
 ```python
 from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
 import torch
 
-from svdiff_pytorch import load_unet_for_svdiff
+from svdiff_pytorch import load_unet_for_svdiff, load_text_encoder_for_svdiff
 
 pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5"
-spectral_shifts_ckpt = "spectral_shifts.safetensors-path"
-unet = load_unet_for_svdiff(pretrained_model_name_or_path, spectral_shifts_ckpt=spectral_shifts_ckpt, subfolder="unet")
+spectral_shifts_ckpt_dir = "ckpt-dir-path"
+unet = load_unet_for_svdiff(pretrained_model_name_or_path, spectral_shifts_ckpt=spectral_shifts_ckpt_dir, subfolder="unet")
+text_encoder = load_text_encoder_for_svdiff(pretrained_model_name_or_path, spectral_shifts_ckpt=spectral_shifts_ckpt_dir, subfolder="text_encoder")
 # load pipe
 pipe = StableDiffusionPipeline.from_pretrained(
     pretrained_model_name_or_path,
     unet=unet,
+    text_encoder=text_encoder,
 )
 pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
 pipe.to("cuda")
@@ -82,14 +93,14 @@ You can use the following CLI too! Once it's done, you will see `grid.png` for t
 ```bash
 python inference.py \
   --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
-  --spectral_shifts_ckpt="spectral_shifts.safetensors-path" \
+  --spectral_shifts_ckpt="ckpt-dir-path" \
   --prompt="A picture of a sks dog in a bucket" \
   --scheduler_type="dpm_solver++" \
   --num_inference_steps=25 \
   --num_images_per_prompt=2
 ```
 
-## Gradio
+### Gradio
 You can also try SVDiff-pytorch in a UI with [gradio](https://gradio.app/). This demo supports both training and inference!
 
 [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg)](https://huggingface.co/spaces/svdiff-library/SVDiff-Training-UI)
@@ -103,7 +114,73 @@ $ export HF_TOKEN="YOUR_HF_TOKEN_HERE"
 $ python app.py
 ```
 
+## Single Image Editing
+### Training
+In Single Image Editing, your instance prompt should be just the description of your input image **without the identifier**.
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
+export INSTANCE_DIR="dir-path-to-input-image"
+export CLASS_DIR="path-to-class-images"
+export OUTPUT_DIR="path-to-save-model"
+
+accelerate launch train_svdiff.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --instance_data_dir=$INSTANCE_DIR \
+  --class_data_dir=$CLASS_DIR \
+  --output_dir=$OUTPUT_DIR \
+  --with_prior_preservation --prior_loss_weight=1.0 \
+  --instance_prompt="photo of a pink chair with black legs" \
+  --class_prompt="photo of a chair" \
+  --resolution=512 \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=1 \
+  --learning_rate=1e-3 \
+  --learning_rate_1d=1e-6 \
+  --train_text_encoder \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --num_class_images=200 \
+  --max_train_steps=500
+```
+
+### Inference
+
+```python
+import torch
+from PIL import Image
+from diffusers import DDIMScheduler
+from svdiff_pytorch import load_unet_for_svdiff, load_text_encoder_for_svdiff, StableDiffusionPipelineWithDDIMInversion
+
+pretrained_model_name_or_path = "runwayml/stable-diffusion-v1-5"
+spectral_shifts_ckpt_dir = "ckpt-dir-path"
+image = "path-to-image"
+source_prompt = "prompt-for-image"
+target_prompt = "prompt-you-want-to-generate"
+
+unet = load_unet_for_svdiff(pretrained_model_name_or_path, spectral_shifts_ckpt=spectral_shifts_ckpt_dir, subfolder="unet")
+text_encoder = load_text_encoder_for_svdiff(pretrained_model_name_or_path, spectral_shifts_ckpt=spectral_shifts_ckpt_dir, subfolder="text_encoder")
+# load pipe
+pipe = StableDiffusionPipelineWithDDIMInversion.from_pretrained(
+    pretrained_model_name_or_path,
+    unet=unet,
+    text_encoder=text_encoder,
+)
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+pipe.to("cuda")
+
+# (optional) ddim inversion
+# if you don't do it, inv_latents = None
+image = Image.open(image).convert("RGB").resize((512, 512))
+# in SVDiff, they use guidance scale=1 in ddim inversion
+inv_latents = pipe.invert(source_prompt, image=image, guidance_scale=1.0).latents
+
+image = pipe(target_prompt, latents=inv_latents).images[0]
+```
+
+
 ## Additional Features
+
 ### Spectral Shift Scaling
 
 ![scale](assets/scale.png)
@@ -165,6 +242,7 @@ And, add `--enable_tome_merging` to your training arguments!
 - [x] Training
 - [x] Inference
 - [x] Scaling spectral shifts
+- [x] Support Single Image Editing
 - [ ] Support multiple spectral shifts (Section 3.2)
 - [ ] Cut-Mix-Unmix (Section 3.3)
 - [ ] SVDiff + LoRA

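As a quick sanity check of the parameter and file-size comparison above, the spectral shifts of the UNet can be counted directly. This is a minimal, hypothetical sketch (not part of the repository) that assumes the v0.2.0 API shown in this README diff:

```python
from svdiff_pytorch import load_unet_for_svdiff

# load the SVDiff-wrapped UNet without any trained checkpoint; every parameter whose
# name contains "delta" is one spectral shift (one scalar per singular value)
unet = load_unet_for_svdiff("runwayml/stable-diffusion-v1-5", subfolder="unet")
n_delta = sum(p.numel() for n, p in unet.named_parameters() if "delta" in n)
print(f"UNet spectral shifts: {n_delta / 1e6:.2f} M parameters")
# a fp32 safetensors file of these shifts is roughly n_delta * 4 bytes
print(f"approx. checkpoint size: {n_delta * 4 / 1e6:.1f} MB")
```

The exact figures depend on which layers carry spectral shifts in a given version, so treat the printout rather than the README number as ground truth for your install.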
‎assets/chair-result.png

490 KB

‎inference.py

+45 −2

@@ -1,10 +1,13 @@
 import argparse
+import os
 from tqdm import tqdm
 import random
 import torch
+import huggingface_hub
+from transformers import CLIPTextModel
 from diffusers import StableDiffusionPipeline
 from diffusers.utils import is_xformers_available
-from svdiff_pytorch import load_unet_for_svdiff, SCHEDULER_MAPPING, image_grid
+from svdiff_pytorch import load_unet_for_svdiff, load_text_encoder_for_svdiff, SCHEDULER_MAPPING, image_grid
 
 
 def parse_args():
@@ -14,7 +17,7 @@ def parse_args():
     # diffusers config
     parser.add_argument("--prompt", type=str, nargs="?", default="a photo of *s", help="the prompt to render")
     parser.add_argument("--num_inference_steps", type=int, default=50, help="number of sampling steps")
-    parser.add_argument("--guidance_scale", type=float, default=1.0, help="unconditional guidance scale")
+    parser.add_argument("--guidance_scale", type=float, default=7.5, help="unconditional guidance scale")
     parser.add_argument("--num_images_per_prompt", type=int, default=1, help="number of images per prompt")
     parser.add_argument("--height", type=int, default=512, help="image height, in pixel space",)
     parser.add_argument("--width", type=int, default=512, help="image width, in pixel space",)
@@ -27,6 +30,33 @@ def parse_args():
     return args
 
 
+def load_text_encoder(pretrained_model_name_or_path, spectral_shifts_ckpt, device, fp16=False):
+    if os.path.isdir(spectral_shifts_ckpt):
+        spectral_shifts_ckpt = os.path.join(spectral_shifts_ckpt, "spectral_shifts_te.safetensors")
+    elif not os.path.exists(spectral_shifts_ckpt):
+        # download from hub
+        hf_hub_kwargs = {}
+        try:
+            spectral_shifts_ckpt = huggingface_hub.hf_hub_download(spectral_shifts_ckpt, filename="spectral_shifts_te.safetensors", **hf_hub_kwargs)
+        except huggingface_hub.utils.EntryNotFoundError:
+            return CLIPTextModel.from_pretrained(pretrained_model_name_or_path, subfolder="text_encoder", torch_dtype=torch.float16 if fp16 else None).to(device)
+    if not os.path.exists(spectral_shifts_ckpt):
+        return CLIPTextModel.from_pretrained(pretrained_model_name_or_path, subfolder="text_encoder", torch_dtype=torch.float16 if fp16 else None).to(device)
+    text_encoder = load_text_encoder_for_svdiff(
+        pretrained_model_name_or_path=pretrained_model_name_or_path,
+        spectral_shifts_ckpt=spectral_shifts_ckpt,
+        subfolder="text_encoder",
+    )
+    # first perform svd and cache
+    for module in text_encoder.modules():
+        if hasattr(module, "perform_svd"):
+            module.perform_svd()
+    if fp16:
+        text_encoder = text_encoder.to(device, dtype=torch.float16)
+    return text_encoder
+
+
+
 def main():
     args = parse_args()
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -40,10 +70,18 @@ def main():
             module.perform_svd()
     if args.fp16:
         unet = unet.to(device, dtype=torch.float16)
+    text_encoder = load_text_encoder(
+        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
+        spectral_shifts_ckpt=args.spectral_shifts_ckpt,
+        fp16=args.fp16,
+        device=device
+    )
+
     # load pipe
     pipe = StableDiffusionPipeline.from_pretrained(
         args.pretrained_model_name_or_path,
         unet=unet,
+        text_encoder=text_encoder,
         requires_safety_checker=False,
         safety_checker=None,
         feature_extractor=None,
@@ -67,6 +105,11 @@ def main():
         for module in pipe.unet.modules():
             if hasattr(module, "set_scale"):
                 module.set_scale(scale=args.spectral_shifts_scale)
+        if not isinstance(pipe.text_encoder, CLIPTextModel):
+            for module in pipe.text_encoder.modules():
+                if hasattr(module, "set_scale"):
+                    module.set_scale(scale=args.spectral_shifts_scale)
+
         print(f"Set spectral_shifts_scale to {args.spectral_shifts_scale}!")
 
     if args.seed == "random_seed":

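The new `set_scale` loop above is what `--spectral_shifts_scale` drives: the same scale is pushed into every SVD layer of the UNet and, when a spectral-shift text encoder was loaded, of the text encoder as well. A minimal, hypothetical sketch of doing the same by hand on a pipeline loaded as in the README (`pipe` and the value 1.2 are placeholders, not repository defaults):

```python
scale = 1.2  # > 1.0 amplifies the learned shifts, < 1.0 tones them down, 1.0 is the trained behaviour

for module in pipe.unet.modules():
    if hasattr(module, "set_scale"):
        module.set_scale(scale=scale)

# a plain CLIPTextModel has no SVD layers, so the hasattr guard makes this a no-op in that case
for module in pipe.text_encoder.modules():
    if hasattr(module, "set_scale"):
        module.set_scale(scale=scale)
```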
‎requirements.txt

+1 −1

@@ -2,7 +2,7 @@ diffusers==0.14.0
 accelerate
 torchvision
 safetensors
-transformers>=4.25.1
+transformers>=4.25.1, <=4.27.3
 ftfy
 tensorboard
 Jinja2

‎scripts/svdiff_pytorch.ipynb

+1,681 −260
Large diffs are not rendered by default.

‎setup.py

+1 −1

@@ -3,7 +3,7 @@
 
 setup(
     name="svdiff-pytorch",
-    version="0.1.1",
+    version="0.2.0",
     author="Makoto Shing",
     url="https://github.com/mkshing/svdiff-pytorch",
     description="Implementation of 'SVDiff: Compact Parameter Space for Diffusion Fine-Tuning'",

‎svdiff_pytorch/__init__.py

+3 −1

@@ -1,2 +1,4 @@
 from svdiff_pytorch.diffusers_models.unet_2d_condition import UNet2DConditionModel as UNet2DConditionModelForSVDiff
-from svdiff_pytorch.utils import load_unet_for_svdiff, image_grid, SCHEDULER_MAPPING
+from svdiff_pytorch.transformers_models_clip.modeling_clip import CLIPTextModel as CLIPTextModelForSVDiff
+from svdiff_pytorch.utils import load_unet_for_svdiff, load_text_encoder_for_svdiff, image_grid, SCHEDULER_MAPPING
+from svdiff_pytorch.pipeline_stable_diffusion_ddim_inversion import StableDiffusionPipelineWithDDIMInversion

‎svdiff_pytorch/diffusers_models/attention.py

+7 −7

@@ -21,7 +21,7 @@
 from diffusers.utils.import_utils import is_xformers_available
 from svdiff_pytorch.diffusers_models.cross_attention import CrossAttention
 from diffusers.models.embeddings import CombinedTimestepLabelEmbeddings
-from svdiff_pytorch.layers import SVDLinear
+from svdiff_pytorch.layers import SVDLinear, SVDGroupNorm, SVDLayerNorm
 
 
 if is_xformers_available():
@@ -62,7 +62,7 @@ def __init__(
 
         self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
         self.num_head_size = num_head_channels
-        self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
+        self.group_norm = SVDGroupNorm(num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
 
         # define q,k,v as linear layers
         self.query = SVDLinear(channels, channels)
@@ -252,7 +252,7 @@ def __init__(
         elif self.use_ada_layer_norm_zero:
             self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
         else:
-            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+            self.norm1 = SVDLayerNorm(dim, elementwise_affine=norm_elementwise_affine)
 
         if cross_attention_dim is not None:
             # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
@@ -261,13 +261,13 @@ def __init__(
             self.norm2 = (
                 AdaLayerNorm(dim, num_embeds_ada_norm)
                 if self.use_ada_layer_norm
-                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+                else SVDLayerNorm(dim, elementwise_affine=norm_elementwise_affine)
             )
         else:
             self.norm2 = None
 
         # 3. Feed-forward
-        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.norm3 = SVDLayerNorm(dim, elementwise_affine=norm_elementwise_affine)
 
     def forward(
         self,
@@ -453,7 +453,7 @@ def __init__(self, embedding_dim, num_embeddings):
         self.emb = nn.Embedding(num_embeddings, embedding_dim)
         self.silu = nn.SiLU()
         self.linear = SVDLinear(embedding_dim, embedding_dim * 2)
-        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
+        self.norm = SVDLayerNorm(embedding_dim, elementwise_affine=False)
 
     def forward(self, x, timestep):
         emb = self.linear(self.silu(self.emb(timestep)))
@@ -474,7 +474,7 @@ def __init__(self, embedding_dim, num_embeddings):
 
         self.silu = nn.SiLU()
         self.linear = SVDLinear(embedding_dim, 6 * embedding_dim, bias=True)
-        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
+        self.norm = SVDLayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
 
     def forward(self, x, timestep, class_labels, hidden_dtype=None):
         emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))

‎svdiff_pytorch/diffusers_models/cross_attention.py

+3 −3

@@ -19,7 +19,7 @@
 
 from diffusers.utils import deprecate, logging
 from diffusers.utils.import_utils import is_xformers_available
-from svdiff_pytorch.layers import SVDLinear
+from svdiff_pytorch.layers import SVDLinear, SVDGroupNorm, SVDLayerNorm
 
 
 logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -80,12 +80,12 @@ def __init__(
         self.added_kv_proj_dim = added_kv_proj_dim
 
         if norm_num_groups is not None:
-            self.group_norm = nn.GroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
+            self.group_norm = SVDGroupNorm(num_channels=inner_dim, num_groups=norm_num_groups, eps=1e-5, affine=True)
         else:
             self.group_norm = None
 
         if cross_attention_norm:
-            self.norm_cross = nn.LayerNorm(cross_attention_dim)
+            self.norm_cross = SVDLayerNorm(cross_attention_dim)
 
         self.to_q = SVDLinear(query_dim, inner_dim, bias=bias)
         self.to_k = SVDLinear(cross_attention_dim, inner_dim, bias=bias)

‎svdiff_pytorch/diffusers_models/embeddings.py

+1 −1

@@ -137,7 +137,7 @@ def __init__(
             in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
         )
         if layer_norm:
-            self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
+            self.norm = SVDLayerNorm(embed_dim, elementwise_affine=False, eps=1e-6)
         else:
             self.norm = None

‎svdiff_pytorch/diffusers_models/resnet.py

+4 −4

@@ -6,7 +6,7 @@
 import torch.nn.functional as F
 
 from svdiff_pytorch.diffusers_models.attention import AdaGroupNorm
-from svdiff_pytorch.layers import SVDConv1d, SVDConv2d, SVDLinear
+from svdiff_pytorch.layers import SVDConv1d, SVDConv2d, SVDLinear, SVDGroupNorm, SVDLayerNorm
 
 
 class Upsample1D(nn.Module):
@@ -472,7 +472,7 @@ def __init__(
         if self.time_embedding_norm == "ada_group":
             self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps)
         else:
-            self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+            self.norm1 = SVDGroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
 
         self.conv1 = SVDConv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
 
@@ -491,7 +491,7 @@ def __init__(
         if self.time_embedding_norm == "ada_group":
             self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps)
         else:
-            self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+            self.norm2 = SVDGroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
 
         self.dropout = torch.nn.Dropout(dropout)
         conv_2d_out_channels = conv_2d_out_channels or out_channels
@@ -609,7 +609,7 @@ def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
         super().__init__()
 
         self.conv1d = SVDConv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2)
-        self.group_norm = nn.GroupNorm(n_groups, out_channels)
+        self.group_norm = SVDGroupNorm(n_groups, out_channels)
         self.mish = nn.Mish()
 
     def forward(self, x):

‎svdiff_pytorch/diffusers_models/transformer_2d.py

+4 −4

@@ -24,7 +24,7 @@
 from svdiff_pytorch.diffusers_models.attention import BasicTransformerBlock
 from diffusers.models.embeddings import PatchEmbed
 from diffusers.models.modeling_utils import ModelMixin
-from svdiff_pytorch.layers import SVDConv1d, SVDConv2d, SVDLinear
+from svdiff_pytorch.layers import SVDConv1d, SVDConv2d, SVDLinear, SVDGroupNorm, SVDLayerNorm
 
 
 @dataclass
@@ -143,7 +143,7 @@ def __init__(
         if self.is_input_continuous:
             self.in_channels = in_channels
 
-            self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+            self.norm = SVDGroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
             if use_linear_projection:
                 self.proj_in = SVDLinear(in_channels, inner_dim)
             else:
@@ -205,10 +205,10 @@ def __init__(
             else:
                 self.proj_out = SVDConv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
         elif self.is_input_vectorized:
-            self.norm_out = nn.LayerNorm(inner_dim)
+            self.norm_out = SVDLayerNorm(inner_dim)
             self.out = SVDLinear(inner_dim, self.num_vector_embeds - 1)
         elif self.is_input_patches:
-            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+            self.norm_out = SVDLayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
             self.proj_out_1 = SVDLinear(inner_dim, 2 * inner_dim)
             self.proj_out_2 = SVDLinear(inner_dim, patch_size * patch_size * self.out_channels)

‎svdiff_pytorch/diffusers_models/unet_2d_blocks.py

+3 −3

@@ -22,7 +22,7 @@
 from svdiff_pytorch.diffusers_models.dual_transformer_2d import DualTransformer2DModel
 from svdiff_pytorch.diffusers_models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D
 from svdiff_pytorch.diffusers_models.transformer_2d import Transformer2DModel
-from svdiff_pytorch.layers import SVDConv1d, SVDConv2d, SVDLinear
+from svdiff_pytorch.layers import SVDConv1d, SVDConv2d, SVDLinear, SVDLayerNorm, SVDGroupNorm
 
 
 def get_down_block(
@@ -2089,7 +2089,7 @@ def __init__(
                 kernel="fir",
             )
             self.skip_conv = SVDConv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
-            self.skip_norm = torch.nn.GroupNorm(
+            self.skip_norm = SVDGroupNorm(
                 num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
             )
             self.act = nn.SiLU()
@@ -2186,7 +2186,7 @@ def __init__(
                 kernel="fir",
             )
             self.skip_conv = SVDConv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
-            self.skip_norm = torch.nn.GroupNorm(
+            self.skip_norm = SVDGroupNorm(
                 num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
             )
             self.act = nn.SiLU()

‎svdiff_pytorch/diffusers_models/unet_2d_condition.py

+2 −2

@@ -34,7 +34,7 @@
     get_down_block,
     get_up_block,
 )
-from svdiff_pytorch.layers import SVDConv1d, SVDConv2d, SVDLinear
+from svdiff_pytorch.layers import SVDConv1d, SVDConv2d, SVDLinear, SVDGroupNorm, SVDLayerNorm
 
 
 logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -348,7 +348,7 @@ def __init__(
 
         # out
         if norm_num_groups is not None:
-            self.conv_norm_out = nn.GroupNorm(
+            self.conv_norm_out = SVDGroupNorm(
                 num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
             )
             self.conv_act = nn.SiLU()

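The seven `diffusers_models` diffs above all make the same substitution: `nn.GroupNorm`/`nn.LayerNorm` become `SVDGroupNorm`/`SVDLayerNorm` so that the 1-D norm weights also receive trainable spectral shifts. The constructors are call-compatible, and with the shift still at zero the SVD variants reproduce the original layers. A minimal, hypothetical check (not from the repository):

```python
import torch
from svdiff_pytorch.layers import SVDGroupNorm

x = torch.randn(2, 32, 8, 8)
svd_norm = SVDGroupNorm(num_groups=8, num_channels=32)   # same signature as nn.GroupNorm
ref_norm = torch.nn.GroupNorm(num_groups=8, num_channels=32)

# delta starts at zero and singular values are non-negative, so
# U @ diag(relu(S + 0)) @ Vh rebuilds the original (default) weight
assert torch.allclose(svd_norm(x), ref_norm(x), atol=1e-6)
```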
‎svdiff_pytorch/layers.py

+122 −16

@@ -17,12 +17,9 @@ def __init__(
         nn.Conv2d.__init__(self, in_channels, out_channels, kernel_size, **kwargs)
         assert type(kernel_size) is int
         weight_reshaped = rearrange(self.weight, 'co cin h w -> co (cin h w)')
-        U, S, Vh = torch.linalg.svd(weight_reshaped, full_matrices=False)
-        self.U = U
-        self.S = S
-        self.Vh = Vh
+        self.U, self.S, self.Vh = torch.linalg.svd(weight_reshaped, full_matrices=False)
         # initialize to 0 for smooth tuning
-        self.delta = nn.Parameter(torch.zeros_like(S))
+        self.delta = nn.Parameter(torch.zeros_like(self.S))
         self.weight.requires_grad = False
         self.done_svd = False
         self.scale = scale
@@ -63,12 +60,9 @@ def __init__(
         nn.Conv1d.__init__(self, in_channels, out_channels, kernel_size, **kwargs)
         assert type(kernel_size) is int
         weight_reshaped = rearrange(self.weight, 'co cin h w -> co (cin h w)')
-        U, S, Vh = torch.linalg.svd(weight_reshaped, full_matrices=False)
-        self.U = U
-        self.S = S
-        self.Vh = Vh
+        self.U, self.S, self.Vh = torch.linalg.svd(weight_reshaped, full_matrices=False)
         # initialize to 0 for smooth tuning
-        self.delta = nn.Parameter(torch.zeros_like(S))
+        self.delta = nn.Parameter(torch.zeros_like(self.S))
         self.weight.requires_grad = False
         self.done_svd = False
         self.scale = scale
@@ -107,12 +101,9 @@ def __init__(
         **kwargs
     ):
         nn.Linear.__init__(self, in_features, out_features, **kwargs)
-        U, S, Vh = torch.linalg.svd(self.weight, full_matrices=False)
-        self.U = U
-        self.S = S
-        self.Vh = Vh
+        self.U, self.S, self.Vh = torch.linalg.svd(self.weight, full_matrices=False)
         # initialize to 0 for smooth tuning
-        self.delta = nn.Parameter(torch.zeros_like(S))
+        self.delta = nn.Parameter(torch.zeros_like(self.S))
         self.weight.requires_grad = False
         self.done_svd = False
         self.scale = scale
@@ -135,4 +126,119 @@ def forward(self, x: torch.Tensor):
             # this happens after loading the state dict
             self.perform_svd()
         weight_updated = self.U.to(x.device, dtype=x.dtype) @ torch.diag(F.relu(self.S.to(x.device, dtype=x.dtype)+self.scale * self.delta)) @ self.Vh.to(x.device, dtype=x.dtype)
-        return F.linear(x, weight_updated, bias=self.bias)
+        return F.linear(x, weight_updated, bias=self.bias)
+
+
+class SVDEmbedding(nn.Embedding):
+    # LoRA implemented in a dense layer
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        scale: float = 1.0,
+        **kwargs
+    ):
+        nn.Embedding.__init__(self, num_embeddings, embedding_dim, **kwargs)
+        self.U, self.S, self.Vh = torch.linalg.svd(self.weight, full_matrices=False)
+        # initialize to 0 for smooth tuning
+        self.delta = nn.Parameter(torch.zeros_like(self.S))
+        self.weight.requires_grad = False
+        self.done_svd = False
+        self.scale = scale
+        self.reset_parameters()
+
+    def set_scale(self, scale: float):
+        self.scale = scale
+
+    def perform_svd(self):
+        self.U, self.S, self.Vh = torch.linalg.svd(self.weight, full_matrices=False)
+        self.done_svd = True
+
+    def reset_parameters(self):
+        nn.Embedding.reset_parameters(self)
+        if hasattr(self, 'delta'):
+            nn.init.zeros_(self.delta)
+
+    def forward(self, x: torch.Tensor):
+        if not self.done_svd:
+            # this happens after loading the state dict
+            self.perform_svd()
+        weight_updated = self.U.to(x.device) @ torch.diag(F.relu(self.S.to(x.device)+self.scale * self.delta)) @ self.Vh.to(x.device)
+        return F.embedding(x, weight_updated, padding_idx=self.padding_idx, max_norm=self.max_norm, norm_type=self.norm_type, scale_grad_by_freq=self.scale_grad_by_freq, sparse=self.sparse)
+
+
+# 1-D
+class SVDLayerNorm(nn.LayerNorm):
+    def __init__(
+        self,
+        normalized_shape: int,
+        scale: float = 1.0,
+        **kwargs
+    ):
+        nn.LayerNorm.__init__(self, normalized_shape=normalized_shape, **kwargs)
+        self.U, self.S, self.Vh = torch.linalg.svd(self.weight.unsqueeze(0), full_matrices=False)
+        # initialize to 0 for smooth tuning
+        self.delta = nn.Parameter(torch.zeros_like(self.S))
+        self.weight.requires_grad = False
+        self.done_svd = False
+        self.scale = scale
+        self.reset_parameters()
+
+    def set_scale(self, scale: float):
+        self.scale = scale
+
+    def perform_svd(self):
+        self.U, self.S, self.Vh = torch.linalg.svd(self.weight.unsqueeze(0), full_matrices=False)
+        self.done_svd = True
+
+    def reset_parameters(self):
+        nn.LayerNorm.reset_parameters(self)
+        if hasattr(self, 'delta'):
+            nn.init.zeros_(self.delta)
+
+    def forward(self, x: torch.Tensor):
+        if not self.done_svd:
+            # this happens after loading the state dict
+            self.perform_svd()
+        weight_updated = self.U.to(x.device, dtype=x.dtype) @ torch.diag(F.relu(self.S.to(x.device, dtype=x.dtype)+self.scale * self.delta)) @ self.Vh.to(x.device, dtype=x.dtype)
+        weight_updated = weight_updated.squeeze(0)
+        return F.layer_norm(x, normalized_shape=self.normalized_shape, weight=weight_updated, bias=self.bias, eps=self.eps)
+
+
+class SVDGroupNorm(nn.GroupNorm):
+    def __init__(
+        self,
+        num_groups: int,
+        num_channels: int,
+        scale: float = 1.0,
+        **kwargs
+    ):
+        nn.GroupNorm.__init__(self, num_groups, num_channels, **kwargs)
+        self.U, self.S, self.Vh = torch.linalg.svd(self.weight.unsqueeze(0), full_matrices=False)
+        # initialize to 0 for smooth tuning
+        self.delta = nn.Parameter(torch.zeros_like(self.S))
+        self.weight.requires_grad = False
+        self.done_svd = False
+        self.scale = scale
+        self.reset_parameters()
+
+    def set_scale(self, scale: float):
+        self.scale = scale
+
+    def perform_svd(self):
+        self.U, self.S, self.Vh = torch.linalg.svd(self.weight.unsqueeze(0), full_matrices=False)
+        self.done_svd = True
+
+    def reset_parameters(self):
+        nn.GroupNorm.reset_parameters(self)
+        if hasattr(self, 'delta'):
+            nn.init.zeros_(self.delta)
+
+    def forward(self, x: torch.Tensor):
+        if not self.done_svd:
+            # this happens after loading the state dict
+            self.perform_svd()
+        weight_updated = self.U.to(x.device, dtype=x.dtype) @ torch.diag(F.relu(self.S.to(x.device, dtype=x.dtype)+self.scale * self.delta)) @ self.Vh.to(x.device, dtype=x.dtype)
+        weight_updated = weight_updated.squeeze(0)
+        return F.group_norm(x, num_groups=self.num_groups, weight=weight_updated, bias=self.bias, eps=self.eps)
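All of these layers share one idea from the paper: keep the frozen weight's SVD factors `U, S, Vh` and learn only a shift `delta` on the singular values, rebuilding `W = U @ diag(relu(S + scale * delta)) @ Vh` in `forward`. A minimal, hypothetical sketch on `SVDLinear` (not from the repository):

```python
import torch
import torch.nn.functional as F
from svdiff_pytorch.layers import SVDLinear

torch.manual_seed(0)
layer = SVDLinear(4, 3)
x = torch.randn(2, 4)

# with delta == 0 the reconstruction equals the frozen weight, so the layer is a drop-in replacement
out = layer(x)
assert torch.allclose(out, F.linear(x, layer.weight, layer.bias), atol=1e-5)
assert layer.weight.requires_grad is False and layer.delta.requires_grad is True

# shifting the singular values changes the effective weight without ever touching layer.weight
with torch.no_grad():
    layer.delta += 0.1
print((layer(x) - out).abs().max())  # non-zero: the spectral shift is now applied
```

Only the `delta` tensors end up in `spectral_shifts.safetensors`, which is why the checkpoints stay in the low-megabyte range.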
svdiff_pytorch/pipeline_stable_diffusion_ddim_inversion.py

+250

@@ -0,0 +1,250 @@
from typing import Any, Callable, Dict, List, Optional, Union
import PIL
import torch
import torch.nn.functional as F
from diffusers import StableDiffusionPipeline, DDIMInverseScheduler
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import preprocess
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero import Pix2PixInversionPipelineOutput


class StableDiffusionPipelineWithDDIMInversion(StableDiffusionPipeline):
    def __init__(self, vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker: bool = True):
        super().__init__(vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker)
        self.inverse_scheduler = DDIMInverseScheduler.from_config(self.scheduler.config)
        # self.register_modules(inverse_scheduler=DDIMInverseScheduler.from_config(self.scheduler.config))

    def prepare_image_latents(self, image, batch_size, dtype, device, generator=None):
        if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
            raise ValueError(
                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
            )

        image = image.to(device=device, dtype=dtype)

        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if isinstance(generator, list):
            init_latents = [
                self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
            ]
            init_latents = torch.cat(init_latents, dim=0)
        else:
            init_latents = self.vae.encode(image).latent_dist.sample(generator)

        init_latents = self.vae.config.scaling_factor * init_latents

        if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
            raise ValueError(
                f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
            )
        else:
            init_latents = torch.cat([init_latents], dim=0)

        latents = init_latents

        return latents

    def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep: int):
        pred_type = self.inverse_scheduler.config.prediction_type
        alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep]

        beta_prod_t = 1 - alpha_prod_t

        if pred_type == "epsilon":
            return model_output
        elif pred_type == "sample":
            return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5)
        elif pred_type == "v_prediction":
            return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
        else:
            raise ValueError(
                f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`"
            )

    def auto_corr_loss(self, hidden_states, generator=None):
        batch_size, channel, height, width = hidden_states.shape
        if batch_size > 1:
            raise ValueError("Only batch_size 1 is supported for now")

        hidden_states = hidden_states.squeeze(0)
        # hidden_states must be shape [C,H,W] now
        reg_loss = 0.0
        for i in range(hidden_states.shape[0]):
            noise = hidden_states[i][None, None, :, :]
            while True:
                roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item()
                reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2
                reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2

                if noise.shape[2] <= 8:
                    break
                noise = F.avg_pool2d(noise, kernel_size=2)
        return reg_loss

    def kl_divergence(self, hidden_states):
        mean = hidden_states.mean()
        var = hidden_states.var()
        return var + mean**2 - 1 - torch.log(var + 1e-7)

    # based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py#L1063
    @torch.no_grad()
    def invert(
        self,
        prompt: Optional[str] = None,
        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        lambda_auto_corr: float = 20.0,
        lambda_kl: float = 20.0,
        num_reg_steps: int = 0,  # disabled
        num_auto_corr_rolls: int = 5,
    ):
        # 1. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]
        if cross_attention_kwargs is None:
            cross_attention_kwargs = {}

        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Preprocess image
        image = preprocess(image)

        # 4. Prepare latent variables
        latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, device, generator)

        # 5. Encode input prompt
        num_images_per_prompt = 1
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            prompt_embeds=prompt_embeds,
        )

        # 4. Prepare timesteps
        self.inverse_scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.inverse_scheduler.timesteps

        # 7. Denoising loop where we obtain the cross-attention maps.
        num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
        with self.progress_bar(total=num_inference_steps - 1) as progress_bar:
            for i, t in enumerate(timesteps[:-1]):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # regularization of the noise prediction
                with torch.enable_grad():
                    for _ in range(num_reg_steps):
                        if lambda_auto_corr > 0:
                            for _ in range(num_auto_corr_rolls):
                                var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True)

                                # Derive epsilon from model output before regularizing to IID standard normal
                                var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)

                                l_ac = self.auto_corr_loss(var_epsilon, generator=generator)
                                l_ac.backward()

                                grad = var.grad.detach() / num_auto_corr_rolls
                                noise_pred = noise_pred - lambda_auto_corr * grad

                        if lambda_kl > 0:
                            var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True)

                            # Derive epsilon from model output before regularizing to IID standard normal
                            var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t)

                            l_kld = self.kl_divergence(var_epsilon)
                            l_kld.backward()

                            grad = var.grad.detach()
                            noise_pred = noise_pred - lambda_kl * grad

                        noise_pred = noise_pred.detach()

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or (
                    (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0
                ):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        inverted_latents = latents.detach().clone()

        # 8. Post-processing
        image = self.decode_latents(latents.detach())

        # Offload last model to CPU
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        # 9. Convert to PIL.
        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (inverted_latents, image)

        return Pix2PixInversionPipelineOutput(latents=inverted_latents, images=image)


if __name__ == '__main__':
    from PIL import Image
    from diffusers import DDIMScheduler
    model_id = "CompVis/stable-diffusion-v1-4"
    input_prompt = "A photo of Barack Obama"
    prompt = "A photo of Barack Obama smiling with a big grin"
    url = "obama.png"  # https://github.com/cccntu/efficient-prompt-to-prompt/blob/main/ddim-inversion.ipynb

    pipe = StableDiffusionPipelineWithDDIMInversion.from_pretrained(
        model_id,
        # make sure to load ddim here
        scheduler=DDIMScheduler.from_pretrained(model_id, subfolder="scheduler"),
    )
    image = Image.open(url).convert("RGB").resize((512, 512))
    # in SVDiff, they use guidance scale=1 in ddim inversion
    inv_latents = pipe.invert(input_prompt, image=image, guidance_scale=1.0).latents
    image = pipe(prompt, latents=inv_latents).images[0]
    image.save("out.png")
svdiff_pytorch/transformers_models_clip/__init__.py

+2

@@ -0,0 +1,2 @@
# all files in this folder were taken from https://github.com/huggingface/transformers/blob/v4.27.3/src/transformers/models/clip/modeling_clip.py
# so, these files follow the LICENSE of transformers

‎svdiff_pytorch/transformers_models_clip/modeling_clip.py

+1,325
Large diffs are not rendered by default.

‎svdiff_pytorch/utils.py

+75 −5

@@ -13,10 +13,11 @@
     EulerDiscreteScheduler,
     EulerAncestralDiscreteScheduler,
 )
+from transformers import CLIPTextModel, CLIPTextConfig
 from diffusers import UNet2DConditionModel
 from safetensors.torch import safe_open
-from huggingface_hub import hf_hub_download
-from svdiff_pytorch import UNet2DConditionModelForSVDiff
+import huggingface_hub
+from svdiff_pytorch import UNet2DConditionModelForSVDiff, CLIPTextModelForSVDiff
 
 
 
@@ -38,7 +39,7 @@ def load_unet_for_svdiff(pretrained_model_name_or_path, spectral_shifts_ckpt=Non
     missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
     if len(missing_keys) > 0:
         raise ValueError(
-            f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
+            f"Cannot load {model.__class__.__name__} from {pretrained_model_name_or_path} because the following keys are"
             f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
             " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomely initialize"
             " those weights or else make sure your checkpoint file is correct."
@@ -57,7 +58,7 @@ def load_unet_for_svdiff(pretrained_model_name_or_path, spectral_shifts_ckpt=Non
         elif not os.path.exists(spectral_shifts_ckpt):
             # download from hub
             hf_hub_kwargs = {} if hf_hub_kwargs is None else hf_hub_kwargs
-            spectral_shifts_ckpt = hf_hub_download(spectral_shifts_ckpt, filename="spectral_shifts.safetensors", **hf_hub_kwargs)
+            spectral_shifts_ckpt = huggingface_hub.hf_hub_download(spectral_shifts_ckpt, filename="spectral_shifts.safetensors", **hf_hub_kwargs)
         assert os.path.exists(spectral_shifts_ckpt)
 
         with safe_open(spectral_shifts_ckpt, framework="pt", device="cpu") as f:
@@ -68,7 +69,7 @@ def load_unet_for_svdiff(pretrained_model_name_or_path, spectral_shifts_ckpt=Non
                     set_module_tensor_to_device(model, key, param_device, value=f.get_tensor(key), dtype=torch_dtype)
                 else:
                     set_module_tensor_to_device(model, key, param_device, value=f.get_tensor(key))
-        print(f"Resume from {spectral_shifts_ckpt}")
+        print(f"Resumed from {spectral_shifts_ckpt}")
     if "torch_dtype" in kwargs:
         model = model.to(kwargs["torch_dtype"])
     model.register_to_config(_name_or_path=pretrained_model_name_or_path)
@@ -80,6 +81,75 @@ def load_unet_for_svdiff(pretrained_model_name_or_path, spectral_shifts_ckpt=Non
 
 
 
+def load_text_encoder_for_svdiff(
+    pretrained_model_name_or_path,
+    spectral_shifts_ckpt=None,
+    hf_hub_kwargs=None,
+    **kwargs
+):
+    """
+    https://github.com/huggingface/diffusers/blob/v0.14.0/src/diffusers/models/modeling_utils.py#L541
+    """
+    config = CLIPTextConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+    original_model = CLIPTextModel.from_pretrained(pretrained_model_name_or_path, **kwargs)
+    state_dict = original_model.state_dict()
+    with accelerate.init_empty_weights():
+        model = CLIPTextModelForSVDiff(config)
+    # load pre-trained weights
+    param_device = "cpu"
+    torch_dtype = kwargs["torch_dtype"] if "torch_dtype" in kwargs else None
+    spectral_shifts_weights = {n: torch.zeros(p.shape) for n, p in model.named_parameters() if "delta" in n}
+    state_dict.update(spectral_shifts_weights)
+    # move the params from meta device to cpu
+    missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
+    if len(missing_keys) > 0:
+        raise ValueError(
+            f"Cannot load {model.__class__.__name__} from {pretrained_model_name_or_path} because the following keys are"
+            f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
+            " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomely initialize"
+            " those weights or else make sure your checkpoint file is correct."
+        )
+
+    for param_name, param in state_dict.items():
+        accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
+        if accepts_dtype:
+            set_module_tensor_to_device(model, param_name, param_device, value=param, dtype=torch_dtype)
+        else:
+            set_module_tensor_to_device(model, param_name, param_device, value=param)
+
+    if spectral_shifts_ckpt:
+        if os.path.isdir(spectral_shifts_ckpt):
+            spectral_shifts_ckpt = os.path.join(spectral_shifts_ckpt, "spectral_shifts_te.safetensors")
+        elif not os.path.exists(spectral_shifts_ckpt):
+            # download from hub
+            hf_hub_kwargs = {} if hf_hub_kwargs is None else hf_hub_kwargs
+            try:
+                spectral_shifts_ckpt = huggingface_hub.hf_hub_download(spectral_shifts_ckpt, filename="spectral_shifts_te.safetensors", **hf_hub_kwargs)
+            except huggingface_hub.utils.EntryNotFoundError:
+                spectral_shifts_ckpt = None
+        # load state dict only if `spectral_shifts_te.safetensors` exists
+        if spectral_shifts_ckpt is not None and os.path.exists(spectral_shifts_ckpt):
+            with safe_open(spectral_shifts_ckpt, framework="pt", device="cpu") as f:
+                for key in f.keys():
+                    # spectral_shifts_weights[key] = f.get_tensor(key)
+                    accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
+                    if accepts_dtype:
+                        set_module_tensor_to_device(model, key, param_device, value=f.get_tensor(key), dtype=torch_dtype)
+                    else:
+                        set_module_tensor_to_device(model, key, param_device, value=f.get_tensor(key))
+            print(f"Resumed from {spectral_shifts_ckpt}")
+
+    if "torch_dtype" in kwargs:
+        model = model.to(kwargs["torch_dtype"])
+    # model.register_to_config(_name_or_path=pretrained_model_name_or_path)
+    # Set model in evaluation mode to deactivate DropOut modules by default
+    model.eval()
+    del original_model
+    torch.cuda.empty_cache()
+    return model
+
+
+
 def image_grid(imgs, rows, cols):
     assert len(imgs) == rows * cols
     w, h = imgs[0].size

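`spectral_shifts_ckpt` can be a checkpoint directory, a direct path to the safetensors file, or a value that triggers the `hf_hub_download` fallback above. A minimal, hypothetical usage sketch (the repo id below is a placeholder, not a real model):

```python
from svdiff_pytorch import load_unet_for_svdiff, load_text_encoder_for_svdiff

base = "runwayml/stable-diffusion-v1-5"
ckpt = "your-username/your-svdiff-checkpoint"  # hypothetical Hub repo holding spectral_shifts*.safetensors

# neither a directory nor a local file, so both loaders fall back to huggingface_hub.hf_hub_download
unet = load_unet_for_svdiff(base, spectral_shifts_ckpt=ckpt, subfolder="unet")
text_encoder = load_text_encoder_for_svdiff(base, spectral_shifts_ckpt=ckpt, subfolder="text_encoder")
```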
‎train_svdiff.py

+108 −64

@@ -7,6 +7,7 @@
 from pathlib import Path
 from typing import Optional
 from packaging import version
+import itertools
 
 import numpy as np
 import torch
@@ -22,7 +23,7 @@
 from torch.utils.data import Dataset
 from torchvision import transforms
 from tqdm.auto import tqdm
-from transformers import AutoTokenizer, PretrainedConfig
+from transformers import CLIPTextModel, AutoTokenizer, PretrainedConfig
 
 import diffusers
 from diffusers import __version__
@@ -33,7 +34,7 @@
     StableDiffusionPipeline,
     DPMSolverMultistepScheduler,
 )
-from svdiff_pytorch import load_unet_for_svdiff, SCHEDULER_MAPPING
+from svdiff_pytorch import load_unet_for_svdiff, load_text_encoder_for_svdiff, SCHEDULER_MAPPING
 from diffusers.loaders import AttnProcsLayers
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version, is_wandb_available
@@ -72,32 +73,12 @@ def save_model_card(repo_id: str, base_model=str, prompt=str, repo_folder=None):
     """
     model_card = f"""
 # SVDiff-pytorch - {repo_id}
-These are SVDiff weights for {base_model}. The weights were trained on {prompt} using [DreamBooth](https://dreambooth.github.io/).
+These are SVDiff weights for {base_model}. The weights were trained on {prompt}.
 """
     with open(os.path.join(repo_folder, "README.md"), "w") as f:
         f.write(yaml + model_card)
 
 
-def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
-    text_encoder_config = PretrainedConfig.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="text_encoder",
-        revision=revision,
-    )
-    model_class = text_encoder_config.architectures[0]
-
-    if model_class == "CLIPTextModel":
-        from transformers import CLIPTextModel
-
-        return CLIPTextModel
-    elif model_class == "RobertaSeriesModelWithTransformation":
-        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
-
-        return RobertaSeriesModelWithTransformation
-    else:
-        raise ValueError(f"{model_class} is not supported.")
-
-
 def parse_args(input_args=None):
     parser = argparse.ArgumentParser(description="Simple example of a training script.")
     parser.add_argument(
@@ -271,9 +252,15 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--learning_rate",
         type=float,
-        default=5e-4,
+        default=1e-3,
         help="Initial learning rate (after the potential warmup period) to use.",
     )
+    parser.add_argument(
+        "--learning_rate_1d",
+        type=float,
+        default=1e-6,
+        help="Initial learning rate (after the potential warmup period) to use for 1-d weights",
+    )
     parser.add_argument(
         "--scale_lr",
         action="store_true",
@@ -380,6 +367,11 @@ def parse_args(input_args=None):
     parser.add_argument(
         "--enable_token_merging", action="store_true", help="Whether or not to use tomesd on prior generation"
     )
+    parser.add_argument(
+        "--train_text_encoder",
+        action="store_true",
+        help="Whether to train spectral shifts of the text encoder. If set, the text encoder should be float32 precision.",
+    )
     if input_args is not None:
         args = parser.parse_args(input_args)
     else:
@@ -594,6 +586,11 @@ def main(args):
     # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
     # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
+    if args.train_text_encoder and args.gradient_accumulation_steps > 1 and accelerator.num_processes > 1:
+        raise ValueError(
+            "Gradient accumulation is not supported when training the text encoder in distributed training. "
+            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
+        )
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -700,14 +697,14 @@ def main(args):
         use_fast=False,
     )
 
-    # import correct text encoder class
-    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)
-
     # Load scheduler and models
     noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
-    text_encoder = text_encoder_cls.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
-    )
+    if args.train_text_encoder:
+        text_encoder = load_text_encoder_for_svdiff(args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision)
+    else:
+        text_encoder = CLIPTextModel.from_pretrained(
+            args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
+        )
     vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision)
     unet = load_unet_for_svdiff(args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, low_cpu_mem_usage=True)
 
@@ -716,26 +713,26 @@ def main(args):
     text_encoder.requires_grad_(False)
     unet.requires_grad_(False)
     optim_params = []
+    optim_params_1d = []
     for n, p in unet.named_parameters():
         if "delta" in n:
             p.requires_grad = True
-            optim_params.append(p)
+            if "norm" in n:
+                optim_params_1d.append(p)
+            else:
+                optim_params.append(p)
+    if args.train_text_encoder:
+        for n, p in text_encoder.named_parameters():
+            if "delta" in n:
+                p.requires_grad = True
+                if "norm" in n:
+                    optim_params_1d.append(p)
+                else:
+                    optim_params.append(p)
+
     total_params = sum(p.numel() for p in optim_params)
     print(f"Number of Trainable Parameters: {total_params * 1.e-6:.2f} M")
 
-    # For mixed precision training we cast the text_encoder and vae weights to half-precision
-    # as these models are only used for inference, keeping weights in full precision is not required.
-    weight_dtype = torch.float32
-    if accelerator.mixed_precision == "fp16":
-        weight_dtype = torch.float16
-    elif accelerator.mixed_precision == "bf16":
-        weight_dtype = torch.bfloat16
-
-    # Move unet, vae and text_encoder to device and cast to weight_dtype
-    # unet.to(accelerator.device, dtype=weight_dtype)
-    vae.to(accelerator.device, dtype=weight_dtype)
-    text_encoder.to(accelerator.device, dtype=weight_dtype)
-
     if args.enable_xformers_memory_efficient_attention:
         if is_xformers_available():
             import xformers
@@ -751,12 +748,26 @@ def main(args):
 
     if args.gradient_checkpointing:
         unet.enable_gradient_checkpointing()
+        if args.train_text_encoder:
+            text_encoder.gradient_checkpointing_enable()
 
-    if args.scale_lr:
-        args.learning_rate = (
-            args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
+    # Check that all trainable models are in full precision
+    low_precision_error_string = (
+        "Please make sure to always have all model weights in full float32 precision when starting training - even if"
+        " doing mixed precision training. copy of the weights should still be float32."
+    )
+
+    if accelerator.unwrap_model(unet).dtype != torch.float32:
+        raise ValueError(
+            f"Unet loaded as datatype {accelerator.unwrap_model(unet).dtype}. {low_precision_error_string}"
         )
 
+    if args.train_text_encoder and accelerator.unwrap_model(text_encoder).dtype != torch.float32:
+        raise ValueError(
+            f"Text encoder loaded as datatype {accelerator.unwrap_model(text_encoder).dtype}."
+            f" {low_precision_error_string}"
+        )
+
     # Enable TF32 for faster training on Ampere GPUs,
     # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
     if args.allow_tf32:
@@ -782,7 +793,7 @@ def main(args):
 
     # Optimizer creation
     optimizer = optimizer_class(
-        optim_params,
+        [{"params": optim_params}, {"params": optim_params_1d, "lr": args.learning_rate_1d}],
        lr=args.learning_rate,
         betas=(args.adam_beta1, args.adam_beta2),
         weight_decay=args.adam_weight_decay,
@@ -826,9 +837,29 @@
     )
 
     # Prepare everything with our `accelerator`.
-    unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        unet, optimizer, train_dataloader, lr_scheduler
-    )
+    if args.train_text_encoder:
+        unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
+        )
+    else:
+        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+            unet, optimizer, train_dataloader, lr_scheduler
+        )
+
+    # For mixed precision training we cast the text_encoder and vae weights to half-precision
+    # as these models are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
+    # Move unet, vae and text_encoder to device and cast to weight_dtype
+    # unet.to(accelerator.device, dtype=weight_dtype)
+    vae.to(accelerator.device, dtype=weight_dtype)
+    if not args.train_text_encoder:
+        text_encoder.to(accelerator.device, dtype=weight_dtype)
+
 
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -842,14 +873,27 @@
     if accelerator.is_main_process:
         accelerator.init_trackers("svdiff-pytorch", config=vars(args))
 
-    def save_weights(step):
+    # cache keys to save
+    state_dict_keys = [k for k in accelerator.unwrap_model(unet).state_dict().keys() if "delta" in k]
+    if args.train_text_encoder:
+        state_dict_keys_te = [k for k in accelerator.unwrap_model(text_encoder).state_dict().keys() if "delta" in k]
+
+    def save_weights(step, save_path=None):
         # Create the pipeline using using the trained modules and save it.
         if accelerator.is_main_process:
-            save_path = os.path.join(args.output_dir, f"checkpoint-{step}")
+            if save_path is None:
+                save_path = os.path.join(args.output_dir, f"checkpoint-{step}")
             os.makedirs(save_path, exist_ok=True)
-            unet_model = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
-            state_dict = {k: v for k, v in unet_model.state_dict().items() if "delta" in k}
+            state_dict = accelerator.unwrap_model(unet, keep_fp32_wrapper=True).state_dict()
+            # state_dict = {k: v for k, v in unet_model.state_dict().items() if "delta" in k}
+            state_dict = {k: state_dict[k] for k in state_dict_keys}
             save_file(state_dict, os.path.join(save_path, "spectral_shifts.safetensors"))
+            if args.train_text_encoder:
+                state_dict = accelerator.unwrap_model(text_encoder, keep_fp32_wrapper=True).state_dict()
+                # state_dict = {k: v for k, v in unet_model.state_dict().items() if "delta" in k}
+                state_dict = {k: state_dict[k] for k in state_dict_keys_te}
+                save_file(state_dict, os.path.join(save_path, "spectral_shifts_te.safetensors"))
+
        print(f"[*] Weights saved at {save_path}")
 
     # Train!
@@ -897,6 +941,8 @@ def save_weights(step):
 
     for epoch in range(first_epoch, args.num_train_epochs):
         unet.train()
+        if args.train_text_encoder:
+            text_encoder.train()
         for step, batch in enumerate(train_dataloader):
             # Skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
@@ -952,7 +998,11 @@ def save_weights(step):
 
                 accelerator.backward(loss)
                 if accelerator.sync_gradients:
-                    params_to_clip = unet.parameters()
+                    params_to_clip = (
+                        itertools.chain(unet.parameters(), text_encoder.parameters())
+                        if args.train_text_encoder
+                        else unet.parameters()
+                    )
                     accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
                 optimizer.step()
                 lr_scheduler.step()
@@ -970,7 +1020,7 @@ def save_weights(step):
                         # accelerator.save_state(save_path)
                         # logger.info(f"Saved state to {save_path}")
 
-            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
+            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "lr_1d": lr_scheduler.get_last_lr()[1]}
             progress_bar.set_postfix(**logs)
             accelerator.log(logs, step=global_step)
 
@@ -982,14 +1032,8 @@ def save_weights(step):
             log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch)
 
     accelerator.wait_for_everyone()
-    save_weights(global_step)
     # put the latest checkpoint to output-dir
-    save_path = args.output_dir
-    unet_model = accelerator.unwrap_model(unet, keep_fp32_wrapper=True)
-    state_dict = {k: v for k, v in unet_model.state_dict().items() if "delta" in k}
-    save_file(state_dict, os.path.join(save_path, "spectral_shifts.safetensors"))
-    print(f"[*] Weights saved at {save_path}")
-
+    save_weights(global_step, save_path=args.output_dir)
     if accelerator.is_main_process:
         if args.push_to_hub:
             save_model_card(

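The optimizer change above is the core of the new `--learning_rate_1d` flag: spectral shifts that live in 1-D norm layers go into a second parameter group with a much smaller learning rate, while all other shifts use `--learning_rate`. A minimal, hypothetical sketch of the same pattern in isolation (toy module names, not the training script's):

```python
import torch

model = torch.nn.ModuleDict({
    "proj": torch.nn.Linear(8, 8),   # stand-in for 2-D weights (attention/conv deltas)
    "norm": torch.nn.LayerNorm(8),   # stand-in for 1-D norm weights
})

optim_params, optim_params_1d = [], []
for name, param in model.named_parameters():
    (optim_params_1d if "norm" in name else optim_params).append(param)

optimizer = torch.optim.AdamW(
    [{"params": optim_params}, {"params": optim_params_1d, "lr": 1e-6}],  # --learning_rate_1d
    lr=1e-3,  # --learning_rate, applies to the first group
)
print([group["lr"] for group in optimizer.param_groups])  # [0.001, 1e-06]
```

With two groups, the scheduler reports one learning rate per group, which is why the training loop now logs both `lr` and `lr_1d`.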