From 057ea7b504aeea739876f18a28254467199943f5 Mon Sep 17 00:00:00 2001
From: yanzewu
Date: Sat, 14 Sep 2024 17:01:37 +0800
Subject: [PATCH] Support 16GB cards

---
 README.md              |  2 +-
 app_flux.py            | 33 ++++++++++++++++++++++++---------
 docs/pulid_for_flux.md | 12 +++++++-----
 pulid/pipeline_flux.py | 17 ++++++++++++-----
 requirements.txt       |  1 +
 requirements_fp8.txt   |  2 +-
 6 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 2b0fbec..ef1623d 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ We will actively update and maintain this repository in the near future, so plea
 ### updates
 - [x] Local gradio demo is ready now
 - [x] Online HuggingFace demo is ready now [![flux](https://img.shields.io/badge/🤗-PuLID_FLUX_demo-orange)](https://huggingface.co/spaces/yanze/PuLID-FLUX)
-- [x] We optimize the codes to support consumer-grade GPUS, the peak memory is now under 24GB (17GB if you use fp8 model). Check the details [here](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md#local-gradio-demo)
+- [x] We have optimized the code to support consumer-grade GPUs, and now **PuLID-FLUX can run on a 16GB graphics card**. Check the details [here](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md#local-gradio-demo)
 
 Below results are generated with PuLID-FLUX.
 
diff --git a/app_flux.py b/app_flux.py
index f045f6f..39a7a45 100644
--- a/app_flux.py
+++ b/app_flux.py
@@ -42,7 +42,13 @@ def __init__(self, model_name: str, device: str, offload: bool, aggressive_offlo
             offload=self.offload,
             fp8=args.fp8,
         )
-        self.pulid_model = PuLIDPipeline(self.model, device, weight_dtype=torch.bfloat16)
+        self.pulid_model = PuLIDPipeline(self.model, device="cpu" if offload else device, weight_dtype=torch.bfloat16,
+                                         onnx_provider=args.onnx_provider)
+        if offload:
+            self.pulid_model.face_helper.face_det.mean_tensor = self.pulid_model.face_helper.face_det.mean_tensor.to(torch.device("cuda"))
+            self.pulid_model.face_helper.face_det.device = torch.device("cuda")
+            self.pulid_model.face_helper.device = torch.device("cuda")
+            self.pulid_model.device = torch.device("cuda")
         self.pulid_model.load_pretrain(args.pretrained_model)
 
     @torch.inference_mode()
@@ -84,13 +90,6 @@ def generate_image(
 
         use_true_cfg = abs(true_cfg - 1.0) > 1e-2
 
-        if id_image is not None:
-            id_image = resize_numpy_image_long(id_image, 1024)
-            id_embeddings, uncond_id_embeddings = self.pulid_model.get_id_embedding(id_image, cal_uncond=use_true_cfg)
-        else:
-            id_embeddings = None
-            uncond_id_embeddings = None
-
         # prepare input
         x = get_noise(
             1,
@@ -111,10 +110,23 @@
         inp = prepare(t5=self.t5, clip=self.clip, img=x, prompt=opts.prompt)
         inp_neg = prepare(t5=self.t5, clip=self.clip, img=x, prompt=neg_prompt) if use_true_cfg else None
 
-        # offload TEs to CPU, load model to gpu
+        # offload TEs to CPU, load processor models and id encoder to gpu
         if self.offload:
             self.t5, self.clip = self.t5.cpu(), self.clip.cpu()
             torch.cuda.empty_cache()
+            self.pulid_model.components_to_device(torch.device("cuda"))
+
+        if id_image is not None:
+            id_image = resize_numpy_image_long(id_image, 1024)
+            id_embeddings, uncond_id_embeddings = self.pulid_model.get_id_embedding(id_image, cal_uncond=use_true_cfg)
+        else:
+            id_embeddings = None
+            uncond_id_embeddings = None
+
+        # offload processor models and id encoder to CPU, load dit model to gpu
+        if self.offload:
+            self.pulid_model.components_to_device(torch.device("cpu"))
+            torch.cuda.empty_cache()
             if self.aggressive_offload:
                 self.model.components_to_gpu()
             else:
@@ -299,6 +311,9 @@ def create_demo(args, model_name: str, device: str = "cuda" if torch.cuda.is_ava
     parser.add_argument("--offload", action="store_true", help="Offload model to CPU when not in use")
     parser.add_argument("--aggressive_offload", action="store_true", help="Offload model more aggressively to CPU when not in use, for 24G GPUs")
     parser.add_argument("--fp8", action="store_true", help="use flux-dev-fp8 model")
+    parser.add_argument("--onnx_provider", type=str, default="gpu", choices=["gpu", "cpu"],
+                        help="setting onnx_provider to cpu (default: gpu) can help reduce GPU memory usage; when "
+                             "combined with the fp8 option, the peak GPU memory is under 15GB")
     parser.add_argument("--port", type=int, default=8080, help="Port to use")
    parser.add_argument("--dev", action='store_true', help="Development mode")
    parser.add_argument("--pretrained_model", type=str, help='for development')
diff --git a/docs/pulid_for_flux.md b/docs/pulid_for_flux.md
index 06f013c..f5e8ffd 100644
--- a/docs/pulid_for_flux.md
+++ b/docs/pulid_for_flux.md
@@ -11,19 +11,21 @@ up the environment, and download the `flux1-dev.safetensors` (if you want to use
 There are following four options to run the gradio demo:
 
 #### naive bf16
-simply run `python app_flux.py`, the peak memory is around 45GB.
+Simply run `python app_flux.py`; the peak memory is under 45GB.
 
 #### bf16 + offload
-run `python app_flux.py --offload`, the peak memory is around 30GB.
+Run `python app_flux.py --offload`; the peak memory is under 30GB.
 
-#### fp8 + offload
+#### fp8 + offload (for consumer-grade GPUs)
 To use fp8, you need to make sure you have installed `requirements-fp8.txt`, it includes `optimum-quanto` and higher version of PyTorch.
 
-Run `python app_flux.py --offload --fp8`, the peak memory is around 17GB.
+Run `python app_flux.py --offload --fp8 --onnx_provider cpu`; the peak memory is under 15GB, which fits a GPU with 16GB of memory.
+
+If your GPU has 24GB of memory, you can run `python app_flux.py --offload --fp8`; the peak memory is under 17GB.
 
 However, there is a difference in image quality between fp8 and bf16, with some degradation in the former.
 Specifically, the details of the face may be slightly worse, but the layout is similar. If you want the best results
-of PuLID-FLUX, please use bf16 rather than fp8.
+of PuLID-FLUX, or if you have the resources, please use bf16 rather than fp8. We have included a comparison in the table below.
 
 | | case1 | case2 | case3 | case4 |
 
diff --git a/pulid/pipeline_flux.py b/pulid/pipeline_flux.py
index 7d18791..7a528d1 100644
--- a/pulid/pipeline_flux.py
+++ b/pulid/pipeline_flux.py
@@ -19,7 +19,7 @@
 
 
 class PuLIDPipeline(nn.Module):
-    def __init__(self, dit, device, weight_dtype=torch.bfloat16, *args, **kwargs):
+    def __init__(self, dit, device, weight_dtype=torch.bfloat16, onnx_provider='gpu', *args, **kwargs):
         super().__init__()
         self.device = device
         self.weight_dtype = weight_dtype
@@ -68,12 +68,12 @@ def __init__(self, dit, device, weight_dtype=torch.bfloat16, *args, **kwargs):
         self.eva_transform_std = eva_transform_std
         # antelopev2
         snapshot_download('DIAMONIK7777/antelopev2', local_dir='models/antelopev2')
-        self.app = FaceAnalysis(
-            name='antelopev2', root='.', providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
-        )
+        providers = ['CPUExecutionProvider'] if onnx_provider == 'cpu' \
+            else ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.app = FaceAnalysis(name='antelopev2', root='.', providers=providers)
         self.app.prepare(ctx_id=0, det_size=(640, 640))
         self.handler_ante = insightface.model_zoo.get_model('models/antelopev2/glintr100.onnx',
-                                                            providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+                                                            providers=providers)
         self.handler_ante.prepare(ctx_id=0)
         gc.collect()
 
@@ -84,6 +84,13 @@ def __init__(self, dit, device, weight_dtype=torch.bfloat16, *args, **kwargs):
         # other configs
         self.debug_img_list = []
 
+    def components_to_device(self, device):
+        # everything but pulid_ca
+        self.face_helper.face_det = self.face_helper.face_det.to(device)
+        self.face_helper.face_parse = self.face_helper.face_parse.to(device)
+        self.clip_vision_model = self.clip_vision_model.to(device)
+        self.pulid_encoder = self.pulid_encoder.to(device)
+
     def load_pretrain(self, pretrain_path=None):
         hf_hub_download('guozinan/PuLID', 'pulid_flux_v0.9.0.safetensors', local_dir='models')
         ckpt_path = 'models/pulid_flux_v0.9.0.safetensors'
diff --git a/requirements.txt b/requirements.txt
index 413c019..3b5c67e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,6 +10,7 @@ einops
 ftfy
 facexlib
 insightface
+onnxruntime
 onnxruntime-gpu
 accelerate
 SentencePiece
diff --git a/requirements_fp8.txt b/requirements_fp8.txt
index c631956..7159e9c 100644
--- a/requirements_fp8.txt
+++ b/requirements_fp8.txt
@@ -11,8 +11,8 @@ einops
 ftfy
 facexlib
 insightface
+onnxruntime
 onnxruntime-gpu
 accelerate
 SentencePiece
-fire
 safetensors
\ No newline at end of file
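
The memory-saving idea behind the new `components_to_device` calls in `generate_image` is that only one large module occupies the GPU at a time: the text encoders are moved off, the ID-encoding components are moved on just long enough to compute the ID embeddings, and only then is the DiT loaded. Below is a minimal, self-contained sketch of that hand-off pattern, not code from this patch: `offloaded_generate`, `id_encoder`, and `dit` are hypothetical stand-ins, and a CUDA device is assumed to be available.

    import torch
    import torch.nn as nn


    def offloaded_generate(id_encoder: nn.Module, dit: nn.Module, face: torch.Tensor) -> torch.Tensor:
        # Keep peak GPU memory low by hosting only one big module on the GPU at a time.
        device = torch.device("cuda")

        # 1) Move the (comparatively small) ID encoder to the GPU and compute the embeddings.
        id_encoder.to(device)
        with torch.inference_mode():
            id_embeddings = id_encoder(face.to(device))

        # 2) Send the encoder back to CPU and release its GPU memory before the DiT arrives.
        id_encoder.to("cpu")
        torch.cuda.empty_cache()

        # 3) Only now move the large DiT to the GPU and run it with the cached embeddings.
        dit.to(device)
        with torch.inference_mode():
            image = dit(id_embeddings)

        # 4) Offload the DiT again so the next call starts from an empty GPU.
        dit.to("cpu")
        torch.cuda.empty_cache()
        return image

The order of the two calls at each hand-off matters: `.to("cpu")` is what actually moves the module's parameters off the GPU, while `torch.cuda.empty_cache()` only returns the now-unused blocks from PyTorch's caching allocator to the driver so the next large module can claim them.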