Commit
support 16GB cards
ToTheBeginning committed Sep 14, 2024
1 parent 6af4231 commit 057ea7b
Showing 6 changed files with 46 additions and 21 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ We will actively update and maintain this repository in the near future, so plea
### updates
- [x] Local gradio demo is ready now
- [x] Online HuggingFace demo is ready now [![flux](https://img.shields.io/badge/🤗-PuLID_FLUX_demo-orange)](https://huggingface.co/spaces/yanze/PuLID-FLUX)
-- [x] We optimize the codes to support consumer-grade GPUS, the peak memory is now under 24GB (17GB if you use fp8 model). Check the details [here](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md#local-gradio-demo)
+- [x] We have optimized the code to support consumer-grade GPUs, and now **PuLID-FLUX can run on a 16GB graphics card**. Check the details [here](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md#local-gradio-demo)


Below results are generated with PuLID-FLUX.
33 changes: 24 additions & 9 deletions app_flux.py
@@ -42,7 +42,13 @@ def __init__(self, model_name: str, device: str, offload: bool, aggressive_offlo
offload=self.offload,
fp8=args.fp8,
)
-        self.pulid_model = PuLIDPipeline(self.model, device, weight_dtype=torch.bfloat16)
+        self.pulid_model = PuLIDPipeline(self.model, device="cpu" if offload else device, weight_dtype=torch.bfloat16,
+                                         onnx_provider=args.onnx_provider)
+        if offload:
+            self.pulid_model.face_helper.face_det.mean_tensor = self.pulid_model.face_helper.face_det.mean_tensor.to(torch.device("cuda"))
+            self.pulid_model.face_helper.face_det.device = torch.device("cuda")
+            self.pulid_model.face_helper.device = torch.device("cuda")
+            self.pulid_model.device = torch.device("cuda")
self.pulid_model.load_pretrain(args.pretrained_model)

@torch.inference_mode()
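The manual moves in the constructor above are needed because, as the diff suggests, facexlib's face helper tracks its device through plain attributes (`mean_tensor`, `device`) rather than registered buffers, so a blanket `.to()` would miss them. A condensed sketch of the pattern (names follow this diff; illustrative, not an official facexlib contract):

```python
import torch

def pin_face_detection_to_cuda(pulid_model):
    # Keep the PuLID pipeline itself on CPU, but run face detection on GPU.
    # The detector reads these plain attributes at inference time, so they
    # must be updated by hand rather than via a module-wide .to() call.
    cuda = torch.device("cuda")
    det = pulid_model.face_helper.face_det
    det.mean_tensor = det.mean_tensor.to(cuda)
    det.device = cuda
    pulid_model.face_helper.device = cuda
    pulid_model.device = cuda
```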
@@ -84,13 +90,6 @@ def generate_image(

use_true_cfg = abs(true_cfg - 1.0) > 1e-2

-        if id_image is not None:
-            id_image = resize_numpy_image_long(id_image, 1024)
-            id_embeddings, uncond_id_embeddings = self.pulid_model.get_id_embedding(id_image, cal_uncond=use_true_cfg)
-        else:
-            id_embeddings = None
-            uncond_id_embeddings = None
-
# prepare input
x = get_noise(
1,
@@ -111,10 +110,23 @@
inp = prepare(t5=self.t5, clip=self.clip, img=x, prompt=opts.prompt)
inp_neg = prepare(t5=self.t5, clip=self.clip, img=x, prompt=neg_prompt) if use_true_cfg else None

-        # offload TEs to CPU, load model to gpu
+        # offload TEs to CPU, load processor models and id encoder to gpu
if self.offload:
self.t5, self.clip = self.t5.cpu(), self.clip.cpu()
torch.cuda.empty_cache()
+            self.pulid_model.components_to_device(torch.device("cuda"))
+
+        if id_image is not None:
+            id_image = resize_numpy_image_long(id_image, 1024)
+            id_embeddings, uncond_id_embeddings = self.pulid_model.get_id_embedding(id_image, cal_uncond=use_true_cfg)
+        else:
+            id_embeddings = None
+            uncond_id_embeddings = None
+
+        # offload processor models and id encoder to CPU, load dit model to gpu
+        if self.offload:
+            self.pulid_model.components_to_device(torch.device("cpu"))
+            torch.cuda.empty_cache()
if self.aggressive_offload:
self.model.components_to_gpu()
else:
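The relocated ID-embedding block is the heart of the memory saving: under `--offload`, embeddings are computed in the window after the text encoders leave the GPU and before the DiT loads, while the face processors and ID encoder are briefly resident. A condensed restatement of the sequence (illustrative; `frontend` is a hypothetical stand-in for the demo's generator object):

```python
import torch

def extract_id_embeddings(frontend, id_image, use_true_cfg):
    # 1) prompts are already encoded; release the text encoders' GPU memory
    frontend.t5, frontend.clip = frontend.t5.cpu(), frontend.clip.cpu()
    torch.cuda.empty_cache()

    # 2) face processors + ID encoder briefly take the GPU
    frontend.pulid_model.components_to_device(torch.device("cuda"))
    if id_image is not None:
        id_emb, uncond_id_emb = frontend.pulid_model.get_id_embedding(
            id_image, cal_uncond=use_true_cfg)
    else:
        id_emb = uncond_id_emb = None

    # 3) hand the GPU back before the large DiT is loaded
    frontend.pulid_model.components_to_device(torch.device("cpu"))
    torch.cuda.empty_cache()
    return id_emb, uncond_id_emb
```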
@@ -299,6 +311,9 @@ def create_demo(args, model_name: str, device: str = "cuda" if torch.cuda.is_ava
parser.add_argument("--offload", action="store_true", help="Offload model to CPU when not in use")
parser.add_argument("--aggressive_offload", action="store_true", help="Offload model more aggressively to CPU when not in use, for 24G GPUs")
parser.add_argument("--fp8", action="store_true", help="use flux-dev-fp8 model")
parser.add_argument("--onnx_provider", type=str, default="gpu", choices=["gpu", "cpu"],
help="set onnx_provider to cpu (default gpu) can help reduce RAM usage, and when combined with"
"fp8 option, the peak RAM is under 15GB")
parser.add_argument("--port", type=int, default=8080, help="Port to use")
parser.add_argument("--dev", action='store_true', help="Development mode")
parser.add_argument("--pretrained_model", type=str, help='for development')
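For reference, the memory-related CLI surface after this commit can be summarized in a few lines (a sketch of the relevant subset of the parser; the 16GB recipe from the docs is pre-filled in the example call):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--offload", action="store_true",
                    help="offload models to CPU when not in use")
parser.add_argument("--aggressive_offload", action="store_true",
                    help="offload in smaller pieces, for 24GB GPUs")
parser.add_argument("--fp8", action="store_true", help="use the flux-dev-fp8 model")
parser.add_argument("--onnx_provider", type=str, default="gpu", choices=["gpu", "cpu"],
                    help="run the ONNX face models on CPU to save GPU memory")

# the 16GB-card configuration described in docs/pulid_for_flux.md
args = parser.parse_args(["--offload", "--fp8", "--onnx_provider", "cpu"])
```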
12 changes: 7 additions & 5 deletions docs/pulid_for_flux.md
@@ -11,19 +11,21 @@ up the environment, and download the `flux1-dev.safetensors` (if you want to use
There are four options for running the gradio demo:

#### naive bf16
-simply run `python app_flux.py`, the peak memory is around 45GB.
+Simply run `python app_flux.py`; the peak memory is under 45GB.

#### bf16 + offload
-run `python app_flux.py --offload`, the peak memory is around 30GB.
+Run `python app_flux.py --offload`; the peak memory is under 30GB.

-#### fp8 + offload
+#### fp8 + offload (for consumer-grade GPUs)
To use fp8, make sure you have installed the dependencies in `requirements_fp8.txt`; it includes `optimum-quanto` and a newer version of PyTorch.

-Run `python app_flux.py --offload --fp8`, the peak memory is around 17GB.
+Run `python app_flux.py --offload --fp8 --onnx_provider cpu`; the peak memory stays under 15GB, which fits a GPU with 16GB of memory.
+
+If you have a 24GB graphics card, you can instead run `python app_flux.py --offload --fp8`; the peak memory stays under 17GB.

However, there is a difference in image quality between fp8 and bf16, with some degradation in the former.
Specifically, the details of the face may be slightly worse, but the layout is similar. If you want the best results
-of PuLID-FLUX, please use bf16 rather than fp8.
+of PuLID-FLUX or have the resources, please use bf16 rather than fp8.
We have included a comparison in the table below.

| | case1 | case2 | case3 | case4 |
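If you want to verify these peak-memory figures on your own card, PyTorch's allocator statistics are enough (a minimal sketch; run one generation between the two calls):

```python
import torch

torch.cuda.reset_peak_memory_stats()
# ... generate one image through the demo here ...
peak_gb = torch.cuda.max_memory_allocated() / 1024 ** 3
print(f"peak GPU memory allocated: {peak_gb:.1f} GB")
```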
17 changes: 12 additions & 5 deletions pulid/pipeline_flux.py
@@ -19,7 +19,7 @@


class PuLIDPipeline(nn.Module):
-    def __init__(self, dit, device, weight_dtype=torch.bfloat16, *args, **kwargs):
+    def __init__(self, dit, device, weight_dtype=torch.bfloat16, onnx_provider='gpu', *args, **kwargs):
super().__init__()
self.device = device
self.weight_dtype = weight_dtype
@@ -68,12 +68,12 @@ def __init__(self, dit, device, weight_dtype=torch.bfloat16, *args, **kwargs):
self.eva_transform_std = eva_transform_std
# antelopev2
snapshot_download('DIAMONIK7777/antelopev2', local_dir='models/antelopev2')
-        self.app = FaceAnalysis(
-            name='antelopev2', root='.', providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
-        )
+        providers = ['CPUExecutionProvider'] if onnx_provider == 'cpu' \
+            else ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.app = FaceAnalysis(name='antelopev2', root='.', providers=providers)
self.app.prepare(ctx_id=0, det_size=(640, 640))
self.handler_ante = insightface.model_zoo.get_model('models/antelopev2/glintr100.onnx',
-                                                            providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+                                                            providers=providers)
self.handler_ante.prepare(ctx_id=0)

gc.collect()
@@ -84,6 +84,13 @@ def __init__(self, dit, device, weight_dtype=torch.bfloat16, *args, **kwargs):
# other configs
self.debug_img_list = []

+    def components_to_device(self, device):
+        # everything but pulid_ca
+        self.face_helper.face_det = self.face_helper.face_det.to(device)
+        self.face_helper.face_parse = self.face_helper.face_parse.to(device)
+        self.clip_vision_model = self.clip_vision_model.to(device)
+        self.pulid_encoder = self.pulid_encoder.to(device)
+
def load_pretrain(self, pretrain_path=None):
hf_hub_download('guozinan/PuLID', 'pulid_flux_v0.9.0.safetensors', local_dir='models')
ckpt_path = 'models/pulid_flux_v0.9.0.safetensors'
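Taken together, the new `onnx_provider` argument and `components_to_device` give callers explicit control over placement. A hypothetical composition of this surface (`dit` stands for an already-loaded FLUX transformer; the demo itself defers the device moves to generation time):

```python
import torch
from pulid.pipeline_flux import PuLIDPipeline

def build_pipeline(dit):
    # Build on CPU and keep the ONNX face models on the CPU provider.
    pipeline = PuLIDPipeline(dit, device="cpu", weight_dtype=torch.bfloat16,
                             onnx_provider="cpu")
    pipeline.load_pretrain()  # downloads pulid_flux_v0.9.0.safetensors on first use
    # Move every face/ID component except pulid_ca onto the GPU when needed.
    pipeline.components_to_device(torch.device("cuda"))
    return pipeline
```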
1 change: 1 addition & 0 deletions requirements.txt
@@ -10,6 +10,7 @@ einops
ftfy
facexlib
insightface
+onnxruntime
onnxruntime-gpu
accelerate
SentencePiece
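With `onnxruntime` and `onnxruntime-gpu` now both in the dependency picture, it is worth confirming which execution providers your environment actually exposes before choosing `--onnx_provider` (a quick check via ONNX Runtime's public API):

```python
import onnxruntime as ort

# e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider'] with onnxruntime-gpu,
# or just ['CPUExecutionProvider'] with the CPU-only package.
print(ort.get_available_providers())
```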
2 changes: 1 addition & 1 deletion requirements_fp8.txt
@@ -11,8 +11,8 @@ einops
ftfy
facexlib
insightface
onnxruntime
onnxruntime-gpu
accelerate
SentencePiece
fire
safetensors
