Commit
support 16GB cards
ToTheBeginning committed Sep 14, 2024
1 parent 6af4231 commit 057ea7b
Showing 6 changed files with 46 additions and 21 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ We will actively update and maintain this repository in the near future, so plea
### updates
- [x] Local gradio demo is ready now
- [x] Online HuggingFace demo is ready now [![flux](https://img.shields.io/badge/🤗-PuLID_FLUX_demo-orange)](https://huggingface.co/spaces/yanze/PuLID-FLUX)
-- [x] We optimize the codes to support consumer-grade GPUS, the peak memory is now under 24GB (17GB if you use fp8 model). Check the details [here](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md#local-gradio-demo)
+- [x] We have optimized the code to support consumer-grade GPUs, and now **PuLID-FLUX can run on a 16GB graphics card**. Check the details [here](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md#local-gradio-demo)


Below results are generated with PuLID-FLUX.
33 changes: 24 additions & 9 deletions app_flux.py
@@ -42,7 +42,13 @@ def __init__(self, model_name: str, device: str, offload: bool, aggressive_offlo
offload=self.offload,
fp8=args.fp8,
)
-        self.pulid_model = PuLIDPipeline(self.model, device, weight_dtype=torch.bfloat16)
+        self.pulid_model = PuLIDPipeline(self.model, device="cpu" if offload else device, weight_dtype=torch.bfloat16,
+                                         onnx_provider=args.onnx_provider)
+        if offload:
+            self.pulid_model.face_helper.face_det.mean_tensor = self.pulid_model.face_helper.face_det.mean_tensor.to(torch.device("cuda"))
+            self.pulid_model.face_helper.face_det.device = torch.device("cuda")
+            self.pulid_model.face_helper.device = torch.device("cuda")
+            self.pulid_model.device = torch.device("cuda")
self.pulid_model.load_pretrain(args.pretrained_model)

@torch.inference_mode()
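The manual moves in the constructor above are needed because, as the diff suggests, facexlib's face helper tracks its device through plain attributes (`mean_tensor`, `device`) rather than registered buffers, so a blanket `.to()` would miss them. A condensed sketch of the pattern (names follow this diff; illustrative, not an official facexlib contract):

```python
import torch

def pin_face_detection_to_cuda(pulid_model):
    # Keep the PuLID pipeline itself on CPU, but run face detection on GPU.
    # The detector reads these plain attributes at inference time, so they
    # must be updated by hand rather than via a module-wide .to() call.
    cuda = torch.device("cuda")
    det = pulid_model.face_helper.face_det
    det.mean_tensor = det.mean_tensor.to(cuda)
    det.device = cuda
    pulid_model.face_helper.device = cuda
    pulid_model.device = cuda
```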
@@ -84,13 +90,6 @@ def generate_image(

use_true_cfg = abs(true_cfg - 1.0) > 1e-2

-        if id_image is not None:
-            id_image = resize_numpy_image_long(id_image, 1024)
-            id_embeddings, uncond_id_embeddings = self.pulid_model.get_id_embedding(id_image, cal_uncond=use_true_cfg)
-        else:
-            id_embeddings = None
-            uncond_id_embeddings = None
-
# prepare input
x = get_noise(
1,
@@ -111,10 +110,23 @@
inp = prepare(t5=self.t5, clip=self.clip, img=x, prompt=opts.prompt)
inp_neg = prepare(t5=self.t5, clip=self.clip, img=x, prompt=neg_prompt) if use_true_cfg else None

-        # offload TEs to CPU, load model to gpu
+        # offload TEs to CPU, load processor models and id encoder to gpu
if self.offload:
self.t5, self.clip = self.t5.cpu(), self.clip.cpu()
torch.cuda.empty_cache()
+            self.pulid_model.components_to_device(torch.device("cuda"))
+
+        if id_image is not None:
+            id_image = resize_numpy_image_long(id_image, 1024)
+            id_embeddings, uncond_id_embeddings = self.pulid_model.get_id_embedding(id_image, cal_uncond=use_true_cfg)
+        else:
+            id_embeddings = None
+            uncond_id_embeddings = None
+
+        # offload processor models and id encoder to CPU, load dit model to gpu
+        if self.offload:
+            self.pulid_model.components_to_device(torch.device("cpu"))
+            torch.cuda.empty_cache()
if self.aggressive_offload:
self.model.components_to_gpu()
else:
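The relocated ID-embedding block is the heart of the memory saving: under `--offload`, embeddings are computed in the window after the text encoders leave the GPU and before the DiT loads, while the face processors and ID encoder are briefly resident. A condensed restatement of the sequence (illustrative; `frontend` is a hypothetical stand-in for the demo's generator object):

```python
import torch

def extract_id_embeddings(frontend, id_image, use_true_cfg):
    # 1) prompts are already encoded; release the text encoders' GPU memory
    frontend.t5, frontend.clip = frontend.t5.cpu(), frontend.clip.cpu()
    torch.cuda.empty_cache()

    # 2) face processors + ID encoder briefly take the GPU
    frontend.pulid_model.components_to_device(torch.device("cuda"))
    if id_image is not None:
        id_emb, uncond_id_emb = frontend.pulid_model.get_id_embedding(
            id_image, cal_uncond=use_true_cfg)
    else:
        id_emb = uncond_id_emb = None

    # 3) hand the GPU back before the large DiT is loaded
    frontend.pulid_model.components_to_device(torch.device("cpu"))
    torch.cuda.empty_cache()
    return id_emb, uncond_id_emb
```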
@@ -299,6 +311,9 @@ def create_demo(args, model_name: str, device: str = "cuda" if torch.cuda.is_ava
parser.add_argument("--offload", action="store_true", help="Offload model to CPU when not in use")
parser.add_argument("--aggressive_offload", action="store_true", help="Offload model more aggressively to CPU when not in use, for 24G GPUs")
parser.add_argument("--fp8", action="store_true", help="use flux-dev-fp8 model")
parser.add_argument("--onnx_provider", type=str, default="gpu", choices=["gpu", "cpu"],
help="set onnx_provider to cpu (default gpu) can help reduce RAM usage, and when combined with"
"fp8 option, the peak RAM is under 15GB")
parser.add_argument("--port", type=int, default=8080, help="Port to use")
parser.add_argument("--dev", action='store_true', help="Development mode")
parser.add_argument("--pretrained_model", type=str, help='for development')
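For reference, the memory-related CLI surface after this commit can be summarized in a few lines (a sketch of the relevant subset of the parser; the 16GB recipe from the docs is pre-filled in the example call):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--offload", action="store_true",
                    help="offload models to CPU when not in use")
parser.add_argument("--aggressive_offload", action="store_true",
                    help="offload in smaller pieces, for 24GB GPUs")
parser.add_argument("--fp8", action="store_true", help="use the flux-dev-fp8 model")
parser.add_argument("--onnx_provider", type=str, default="gpu", choices=["gpu", "cpu"],
                    help="run the ONNX face models on CPU to save GPU memory")

# the 16GB-card configuration described in docs/pulid_for_flux.md
args = parser.parse_args(["--offload", "--fp8", "--onnx_provider", "cpu"])
```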
12 changes: 7 additions & 5 deletions docs/pulid_for_flux.md
@@ -11,19 +11,21 @@ up the environment, and download the `flux1-dev.safetensors` (if you want to use
There are four options for running the gradio demo:

#### naive bf16
-simply run `python app_flux.py`, the peak memory is around 45GB.
+Simply run `python app_flux.py`; the peak memory is under 45GB.

#### bf16 + offload
-run `python app_flux.py --offload`, the peak memory is around 30GB.
+Run `python app_flux.py --offload`; the peak memory is under 30GB.

-#### fp8 + offload
+#### fp8 + offload (for consumer-grade GPUs)
To use fp8, make sure you have installed the dependencies in `requirements_fp8.txt`; it includes `optimum-quanto` and a newer version of PyTorch.

-Run `python app_flux.py --offload --fp8`, the peak memory is around 17GB.
+Run `python app_flux.py --offload --fp8 --onnx_provider cpu`; the peak memory stays under 15GB, which fits a GPU with 16GB of memory.
+
+If you have a 24GB graphics card, you can instead run `python app_flux.py --offload --fp8`; the peak memory stays under 17GB.

However, there is a difference in image quality between fp8 and bf16, with some degradation in the former.
Specifically, the details of the face may be slightly worse, but the layout is similar. If you want the best results
-of PuLID-FLUX, please use bf16 rather than fp8.
+of PuLID-FLUX or have the resources, please use bf16 rather than fp8.
We have included a comparison in the table below.

| | case1 | case2 | case3 | case4 |
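If you want to verify these peak-memory figures on your own card, PyTorch's allocator statistics are enough (a minimal sketch; run one generation between the two calls):

```python
import torch

torch.cuda.reset_peak_memory_stats()
# ... generate one image through the demo here ...
peak_gb = torch.cuda.max_memory_allocated() / 1024 ** 3
print(f"peak GPU memory allocated: {peak_gb:.1f} GB")
```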
17 changes: 12 additions & 5 deletions pulid/pipeline_flux.py
@@ -19,7 +19,7 @@


class PuLIDPipeline(nn.Module):
-    def __init__(self, dit, device, weight_dtype=torch.bfloat16, *args, **kwargs):
+    def __init__(self, dit, device, weight_dtype=torch.bfloat16, onnx_provider='gpu', *args, **kwargs):
super().__init__()
self.device = device
self.weight_dtype = weight_dtype
@@ -68,12 +68,12 @@ def __init__(self, dit, device, weight_dtype=torch.bfloat16, *args, **kwargs):
self.eva_transform_std = eva_transform_std
# antelopev2
snapshot_download('DIAMONIK7777/antelopev2', local_dir='models/antelopev2')
-        self.app = FaceAnalysis(
-            name='antelopev2', root='.', providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
-        )
+        providers = ['CPUExecutionProvider'] if onnx_provider == 'cpu' \
+            else ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        self.app = FaceAnalysis(name='antelopev2', root='.', providers=providers)
self.app.prepare(ctx_id=0, det_size=(640, 640))
self.handler_ante = insightface.model_zoo.get_model('models/antelopev2/glintr100.onnx',
-                                                            providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
+                                                            providers=providers)
self.handler_ante.prepare(ctx_id=0)

gc.collect()
@@ -84,6 +84,13 @@ def __init__(self, dit, device, weight_dtype=torch.bfloat16, *args, **kwargs):
# other configs
self.debug_img_list = []

+    def components_to_device(self, device):
+        # everything but pulid_ca
+        self.face_helper.face_det = self.face_helper.face_det.to(device)
+        self.face_helper.face_parse = self.face_helper.face_parse.to(device)
+        self.clip_vision_model = self.clip_vision_model.to(device)
+        self.pulid_encoder = self.pulid_encoder.to(device)
+
def load_pretrain(self, pretrain_path=None):
hf_hub_download('guozinan/PuLID', 'pulid_flux_v0.9.0.safetensors', local_dir='models')
ckpt_path = 'models/pulid_flux_v0.9.0.safetensors'
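Taken together, the new `onnx_provider` argument and `components_to_device` give callers explicit control over placement. A hypothetical composition of this surface (`dit` stands for an already-loaded FLUX transformer; the demo itself defers the device moves to generation time):

```python
import torch
from pulid.pipeline_flux import PuLIDPipeline

def build_pipeline(dit):
    # Build on CPU and keep the ONNX face models on the CPU provider.
    pipeline = PuLIDPipeline(dit, device="cpu", weight_dtype=torch.bfloat16,
                             onnx_provider="cpu")
    pipeline.load_pretrain()  # downloads pulid_flux_v0.9.0.safetensors on first use
    # Move every face/ID component except pulid_ca onto the GPU when needed.
    pipeline.components_to_device(torch.device("cuda"))
    return pipeline
```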
1 change: 1 addition & 0 deletions requirements.txt
@@ -10,6 +10,7 @@ einops
ftfy
facexlib
insightface
+onnxruntime
onnxruntime-gpu
accelerate
SentencePiece
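With `onnxruntime` and `onnxruntime-gpu` now both in the dependency picture, it is worth confirming which execution providers your environment actually exposes before choosing `--onnx_provider` (a quick check via ONNX Runtime's public API):

```python
import onnxruntime as ort

# e.g. ['CUDAExecutionProvider', 'CPUExecutionProvider'] with onnxruntime-gpu,
# or just ['CPUExecutionProvider'] with the CPU-only package.
print(ort.get_available_providers())
```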
2 changes: 1 addition & 1 deletion requirements_fp8.txt
@@ -11,8 +11,8 @@ einops
ftfy
facexlib
insightface
onnxruntime
onnxruntime-gpu
accelerate
SentencePiece
fire
safetensors
