# Support for PhotoMaker Version 2 #358

Status: Open. Wants to merge 21 commits into `master`.
**clip.hpp**: 27 changes (22 additions, 5 deletions)
```diff
@@ -343,6 +343,14 @@ class CLIPTokenizer {
         }
     }
 
+    std::string clean_up_tokenization(std::string& text) {
+
+        std::regex pattern(R"( ,)");
+        // Replace " ," with ","
+        std::string result = std::regex_replace(text, pattern, ",");
+        return result;
+    }
+
     std::string decode(const std::vector<int>& tokens) {
         std::string text = "";
         for (int t : tokens) {
@@ -351,8 +359,12 @@ class CLIPTokenizer {
             std::u32string ts = decoder[t];
             // printf("%d, %s \n", t, utf32_to_utf8(ts).c_str());
             std::string s = utf32_to_utf8(ts);
-            if (s.length() >= 4 && ends_with(s, "</w>")) {
-                text += " " + s.replace(s.length() - 4, s.length() - 1, "");
+            if (s.length() >= 4) {
+                if (ends_with(s, "</w>")) {
+                    text += s.replace(s.length() - 4, s.length() - 1, "") + " ";
+                } else {
+                    text += s;
+                }
             } else {
                 text += " " + s;
             }
@@ -364,6 +376,7 @@ class CLIPTokenizer {
 
         // std::string s((char *)bytes.data());
         // std::string s = "";
+        text = clean_up_tokenization(text);
         return trim(text);
     }
 
```
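Net effect of these three hunks: word-final pieces (`</w>`) now contribute a trailing rather than a leading space, bare subword pieces are concatenated directly, and the new `clean_up_tokenization()` pass collapses the `" ,"` left in front of punctuation. Under the old rules every piece got a leading space, so subwords decoded as separate words and commas were preceded by a space. A minimal standalone sketch of the new behavior (the token strings are made up for illustration):

```cpp
// Sketch of the revised CLIPTokenizer::decode() spacing rules.
#include <iostream>
#include <regex>
#include <string>
#include <vector>

static bool ends_with(const std::string& s, const std::string& suffix) {
    return s.size() >= suffix.size() &&
           s.compare(s.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
    // Hypothetical decoded BPE pieces: "photo", "ma"+"ker", ",", "v2"
    std::vector<std::string> pieces = {"photo</w>", "ma", "ker</w>", ",</w>", "v2</w>"};
    std::string text;
    for (const auto& s : pieces) {
        if (s.length() >= 4 && ends_with(s, "</w>"))
            text += s.substr(0, s.length() - 4) + " ";  // word end -> trailing space
        else
            text += s;                                  // mid-word piece -> no space
    }
    // clean_up_tokenization(): replace " ," with ","
    text = std::regex_replace(text, std::regex(R"( ,)"), ",");
    std::cout << "[" << text << "]\n";  // prints "[photo maker, v2 ]"
}
```

The trailing space in the sketch is removed by the `trim()` call that still ends `decode()`.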

```diff
@@ -751,7 +764,8 @@ class CLIPVisionModel : public GGMLBlock {
         blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values,
+                                bool return_pooled = true) {
         // pixel_values: [N, num_channels, image_size, image_size]
         auto embeddings     = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
         auto pre_layernorm  = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
@@ -761,14 +775,17 @@ class CLIPVisionModel : public GGMLBlock {
         auto x = embeddings->forward(ctx, pixel_values);  // [N, num_positions, embed_dim]
         x      = pre_layernorm->forward(ctx, x);
         x      = encoder->forward(ctx, x, -1, false);
+        // print_ggml_tensor(x, true, "ClipVisionModel x: ");
+        auto last_hidden_state = x;
         x = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
 
         GGML_ASSERT(x->ne[3] == 1);
         if (return_pooled) {
             ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
             return pooled;  // [N, hidden_size]
         } else {
-            return x;  // [N, n_token, hidden_size]
+            // return x; // [N, n_token, hidden_size]
+            return last_hidden_state;  // [N, n_token, hidden_size]
         }
     }
 };
```
**conditioner.hpp**: 24 changes (15 additions, 9 deletions)
```diff
@@ -4,6 +4,7 @@
 #include "clip.hpp"
 #include "t5.hpp"
 
+
 struct SDCondition {
     struct ggml_tensor* c_crossattn = NULL;  // aka context
     struct ggml_tensor* c_vector    = NULL;  // aka y
@@ -44,6 +45,7 @@ struct Conditioner {
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
 struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     SDVersion version    = VERSION_SD1;
+    PMVersion pm_version = VERSION_1;
     CLIPTokenizer tokenizer;
     ggml_type wtype;
     std::shared_ptr<CLIPTextModelRunner> text_model;
@@ -59,8 +61,9 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       ggml_type wtype,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
+                                      PMVersion pv      = VERSION_1,
                                       int clip_skip     = -1)
-        : version(version), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
+        : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (version == VERSION_SD2 || version == VERSION_SDXL) {
@@ -159,7 +162,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
     tokenize_with_trigger_token(std::string text,
                                 int num_input_imgs,
                                 int32_t image_token,
-                                bool padding = false) {
+                                bool padding = false){
         return tokenize_with_trigger_token(text, num_input_imgs, image_token,
                                            text_model->model.n_token, padding);
     }
@@ -268,7 +271,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             std::vector<int> clean_input_ids_tmp;
             for (uint32_t i = 0; i < class_token_index[0]; i++)
                 clean_input_ids_tmp.push_back(clean_input_ids[i]);
-            for (uint32_t i = 0; i < num_input_imgs; i++)
+            for (uint32_t i = 0; i < (pm_version == VERSION_2 ? 2 * num_input_imgs : num_input_imgs); i++)
                 clean_input_ids_tmp.push_back(class_token);
             for (uint32_t i = class_token_index[0] + 1; i < clean_input_ids.size(); i++)
                 clean_input_ids_tmp.push_back(clean_input_ids[i]);
@@ -279,13 +282,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end());
             weights.insert(weights.end(), clean_input_ids.size(), curr_weight);
         }
-        tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
-        weights.insert(weights.begin(), 1.0);
+        // BUG!! double counting, pad_tokens will add BOS at the beginning
+        // tokens.insert(tokens.begin(), tokenizer.BOS_TOKEN_ID);
+        // weights.insert(weights.begin(), 1.0);
 
         tokenizer.pad_tokens(tokens, weights, max_length, padding);
 
+        int offset = pm_version == VERSION_2 ? 2 * num_input_imgs : num_input_imgs;
         for (uint32_t i = 0; i < tokens.size(); i++) {
-            if (class_idx + 1 <= i && i < class_idx + 1 + num_input_imgs)
+            // if (class_idx + 1 <= i && i < class_idx + 1 + 2*num_input_imgs) // photomaker V2 has num_tokens(=2)*num_input_imgs
+            if (class_idx + 1 <= i && i < class_idx + 1 + offset)  // photomaker V2 has num_tokens(=2)*num_input_imgs,
+                                                                   // hardcoded for now
                 class_token_mask.push_back(true);
             else
                 class_token_mask.push_back(false);
```
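The V2 branch matters because PhotoMaker V2 produces `num_tokens = 2` stacked id embeddings per input image, so both the repeated class tokens and the `class_token_mask` span must double. A standalone sketch of the mask construction (the prompt length and class-token position are invented values):

```cpp
// Sketch of the widened class_token_mask for PhotoMaker V2.
#include <cstdio>
#include <vector>

enum PMVersion { VERSION_1, VERSION_2 };

int main() {
    int num_input_imgs   = 3;
    PMVersion pm_version = VERSION_2;
    // V2 inserts 2 id-embedding slots per input image, V1 inserts 1.
    int offset = pm_version == VERSION_2 ? 2 * num_input_imgs : num_input_imgs;

    int class_idx = 4;   // position of the class token ("girl", "man", ...)
    int n_tokens  = 16;  // padded prompt length
    std::vector<bool> class_token_mask;
    for (int i = 0; i < n_tokens; i++)
        class_token_mask.push_back(class_idx + 1 <= i && i < class_idx + 1 + offset);

    for (int i = 0; i < n_tokens; i++)
        printf("%d", (int)class_token_mask[i]);
    printf("\n");  // 0000011111100000 -> six marked slots for 3 images in V2
}
```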
```diff
@@ -530,7 +536,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                             int height,
                             int num_input_imgs,
                             int adm_in_channels        = -1,
-                            bool force_zero_embeddings = false) {
+                            bool force_zero_embeddings = false){
         auto image_tokens = convert_token_to_id(trigger_word);
         // if(image_tokens.size() == 1){
         //     printf(" image token id is: %d \n", image_tokens[0]);
@@ -968,7 +974,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                             int height,
                             int num_input_imgs,
                             int adm_in_channels        = -1,
-                            bool force_zero_embeddings = false) {
+                            bool force_zero_embeddings = false){
         GGML_ASSERT(0 && "Not implemented yet!");
     }
 
```
**docs/photo_maker.md**: 24 changes (23 additions, 1 deletion)
Example:

```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v1.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
```

## PhotoMaker Version 2

[PhotoMaker Version 2 (PMV2)](https://github.com/TencentARC/PhotoMaker/blob/main/README_pmv2.md) brings some key improvements. Unfortunately, it also brings a very heavy dependency (the `insightface` face-analysis stack), which makes running it in `SD.cpp` a bit more involved.

Running PMV2 is a two-step process:

- Run the Python script `face_detect.py` to obtain the **id_embeds** for the given input images:

```bash
python face_detect.py input_image_dir
```

An `id_embeds.bin` file will be generated in `input_image_dir`.

**Note: this step only needs to be run once; the same `id_embeds` can be reused.**

- Run the same command as in Version 1, but replace `photomaker-v1.safetensors` with `photomaker-v2.safetensors`. You can download `photomaker-v2.safetensors` from [here](https://huggingface.co/bssrdf/PhotoMakerV2).

- All the command-line parameters from Version 1 remain the same for Version 2; a full example follows below.
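For instance, reusing the Version 1 example verbatim with only the stacked-ID weights swapped (same placeholder paths as above, and `id_embeds.bin` already generated in the `--input-id-images-dir` folder):

```bash
bin/sd -m ../models/sdxlUnstableDiffusers_v11.safetensors --vae ../models/sdxl_vae.safetensors --stacked-id-embd-dir ../models/photomaker-v2.safetensors --input-id-images-dir ../assets/photomaker_examples/scarletthead_woman -p "a girl img, retro futurism, retro game art style but extremely beautiful, intricate details, masterpiece, best quality, space-themed, cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed" -n "realistic, photo-realistic, worst quality, greyscale, bad anatomy, bad hands, error, text" --cfg-scale 5.0 --sampling-method euler -H 1024 -W 1024 --style-ratio 10 --vae-on-cpu -o output.png
```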


**face_detect.py**: 88 changes (88 additions, 0 deletions; new file)

```python
import os
import sys

import numpy as np
import torch
from diffusers.utils import load_image
# pip install insightface==0.7.3
from insightface.app import FaceAnalysis
from insightface.data import get_image as ins_get_image
from safetensors.torch import save_file

###
# https://github.com/cubiq/ComfyUI_IPAdapter_plus/issues/165#issue-2055829543
###
class FaceAnalysis2(FaceAnalysis):
    # NOTE: allows setting det_size for each detection call.
    # the model allows it but the wrapping code from insightface
    # doesn't show it, and people end up loading duplicate models
    # for different sizes where there is absolutely no need to
    def get(self, img, max_num=0, det_size=(640, 640)):
        if det_size is not None:
            self.det_model.input_size = det_size
        return super().get(img, max_num)

def analyze_faces(face_analysis: FaceAnalysis, img_data: np.ndarray, det_size=(640, 640)):
    # NOTE: try to detect faces; if none are detected, lower det_size until some are found
    detection_sizes = [None] + [(size, size) for size in range(640, 256, -64)] + [(256, 256)]

    for size in detection_sizes:
        faces = face_analysis.get(img_data, det_size=size)
        if len(faces) > 0:
            return faces

    return []

if __name__ == "__main__":
    # face_detector = FaceAnalysis2(providers=['CUDAExecutionProvider'], allowed_modules=['detection', 'recognition'])
    face_detector = FaceAnalysis2(providers=['CPUExecutionProvider'], allowed_modules=['detection', 'recognition'])
    face_detector.prepare(ctx_id=0, det_size=(640, 640))
    # input_folder_name = './scarletthead_woman'
    input_folder_name = sys.argv[1]
    image_basename_list = os.listdir(input_folder_name)
    image_path_list = sorted([os.path.join(input_folder_name, basename) for basename in image_basename_list])

    input_id_images = []
    for image_path in image_path_list:
        input_id_images.append(load_image(image_path))

    id_embed_list = []

    for img in input_id_images:
        img = np.array(img)
        img = img[:, :, ::-1]  # RGB -> BGR, the channel order insightface expects
        faces = analyze_faces(face_detector, img)
        if len(faces) > 0:
            id_embed_list.append(torch.from_numpy(faces[0]['embedding']))

    if len(id_embed_list) == 0:
        raise ValueError("No face detected in input image pool")

    id_embeds = torch.stack(id_embed_list)

    # for r in id_embeds:
    #     print(r)
    # # torch.save(id_embeds, input_folder_name+'/id_embeds.pt');
    # weights = dict()
    # weights["id_embeds"] = id_embeds
    # save_file(weights, input_folder_name+'/id_embeds.safetensors')

    binary_data = id_embeds.numpy().tobytes()
    two = 4
    zero = 0
    one = 1
    tensor_name = "id_embeds"
    # Write binary data to a file
    with open(input_folder_name+'/id_embeds.bin', "wb") as f:
        f.write(two.to_bytes(4, byteorder='little'))
        f.write((len(tensor_name)).to_bytes(4, byteorder='little'))
        f.write(zero.to_bytes(4, byteorder='little'))
        f.write((id_embeds.shape[1]).to_bytes(4, byteorder='little'))
        f.write((id_embeds.shape[0]).to_bytes(4, byteorder='little'))
        f.write(one.to_bytes(4, byteorder='little'))
        f.write(one.to_bytes(4, byteorder='little'))
        f.write(tensor_name.encode('ascii'))
        f.write(binary_data)
```
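The `id_embeds.bin` layout is therefore seven little-endian `int32` header fields (`4`, name length, `0`, `shape[1]`, `shape[0]`, `1`, `1`), the ASCII tensor name, then the raw `float32` payload. A standalone C++ sketch that reads the file back, purely mirroring the writer above (the interpretation of the constant fields, e.g. the leading `4` as a dimension count and the `0` as an F32 type tag, is an assumption, as is the file path):

```cpp
// Reads id_embeds.bin exactly as face_detect.py writes it.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    // Path is an assumption; face_detect.py writes <input_dir>/id_embeds.bin
    FILE* f = std::fopen("id_embeds.bin", "rb");
    if (!f) { std::perror("id_embeds.bin"); return 1; }

    int32_t head[7];  // {4, name_len, 0, shape[1], shape[0], 1, 1} per the writer
    if (std::fread(head, sizeof(int32_t), 7, f) != 7) return 1;

    std::string name(head[1], '\0');  // "id_embeds"
    if (std::fread(&name[0], 1, name.size(), f) != name.size()) return 1;

    int64_t dim = head[3], n_imgs = head[4];
    std::vector<float> data((size_t)(n_imgs * dim));
    if (std::fread(data.data(), sizeof(float), data.size(), f) != data.size()) return 1;
    std::fclose(f);

    std::printf("%s: %lld embeddings of dim %lld\n",
                name.c_str(), (long long)n_imgs, (long long)dim);
    return 0;
}
```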


**ggml_extend.hpp**: 8 changes (7 additions, 1 deletion)
```diff
@@ -1048,6 +1048,11 @@ struct GGMLRunner {
                  params_buffer_size / (1024.0 * 1024.0),
                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
                  num_tensors);
+        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
+        //        get_desc().c_str(),
+        //        params_buffer_size / (1024.0 * 1024.0),
+        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
+        //        num_tensors);
         return true;
     }
 
@@ -1217,7 +1222,8 @@ class Linear : public UnaryBlock {
         params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
         if (bias) {
             params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_features);
-        }
+        }
+
     }
 
 public:
```
**model.cpp**: 42 changes (40 additions, 2 deletions)
```diff
@@ -146,6 +146,33 @@ std::unordered_map<std::string, std::string> vae_decoder_name_map = {
     {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"},
 };
 
+std::unordered_map<std::string, std::string> pmid_v2_name_map = {
+    {"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.weight",
+     "pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc1.weight"},
+    {"pmid.qformer_perceiver.perceiver_resampler.layers.0.1.3.weight",
+     "pmid.qformer_perceiver.perceiver_resampler.layers.0.1.1.fc2.weight"},
+    {"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.weight",
+     "pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc1.weight"},
+    {"pmid.qformer_perceiver.perceiver_resampler.layers.1.1.3.weight",
+     "pmid.qformer_perceiver.perceiver_resampler.layers.1.1.1.fc2.weight"},
+    {"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.weight",
+     "pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc1.weight"},
+    {"pmid.qformer_perceiver.perceiver_resampler.layers.2.1.3.weight",
+     "pmid.qformer_perceiver.perceiver_resampler.layers.2.1.1.fc2.weight"},
+    {"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.weight",
+     "pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc1.weight"},
+    {"pmid.qformer_perceiver.perceiver_resampler.layers.3.1.3.weight",
+     "pmid.qformer_perceiver.perceiver_resampler.layers.3.1.1.fc2.weight"},
+    {"pmid.qformer_perceiver.token_proj.0.bias",
+     "pmid.qformer_perceiver.token_proj.fc1.bias"},
+    {"pmid.qformer_perceiver.token_proj.2.bias",
+     "pmid.qformer_perceiver.token_proj.fc2.bias"},
+    {"pmid.qformer_perceiver.token_proj.0.weight",
+     "pmid.qformer_perceiver.token_proj.fc1.weight"},
+    {"pmid.qformer_perceiver.token_proj.2.weight",
+     "pmid.qformer_perceiver.token_proj.fc2.weight"},
+};
+
 std::string convert_open_clip_to_hf_clip(const std::string& name) {
     std::string new_name = name;
     std::string prefix;
```
```diff
@@ -212,6 +239,13 @@ std::string convert_vae_decoder_name(const std::string& name) {
     return name;
 }
 
+std::string convert_pmid_v2_name(const std::string& name) {
+    if (pmid_v2_name_map.find(name) != pmid_v2_name_map.end()) {
+        return pmid_v2_name_map[name];
+    }
+    return name;
+}
+
 /* If not a SDXL LoRA the unet" prefix will have already been replaced by this
  * point and "te2" and "te1" don't seem to appear in non-SDXL only "te_" */
 std::string convert_sdxl_lora_name(std::string tensor_name) {
```
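This is a plain lookup: PhotoMaker V2 checkpoints name the Q-Former MLP sub-layers by sequential index (`...1.1`, `...1.3`, `token_proj.0`, `token_proj.2`), while the C++ blocks expect named `fc1`/`fc2` members, so each known key is remapped and everything else passes through unchanged. A standalone sketch with one entry from the table above:

```cpp
// Minimal demo of the pmid_v2 tensor-name remapping.
#include <cstdio>
#include <string>
#include <unordered_map>

static std::unordered_map<std::string, std::string> pmid_v2_name_map = {
    {"pmid.qformer_perceiver.token_proj.0.weight",
     "pmid.qformer_perceiver.token_proj.fc1.weight"},
};

std::string convert_pmid_v2_name(const std::string& name) {
    auto it = pmid_v2_name_map.find(name);
    return it != pmid_v2_name_map.end() ? it->second : name;  // unknown names pass through
}

int main() {
    std::printf("%s\n",
                convert_pmid_v2_name("pmid.qformer_perceiver.token_proj.0.weight").c_str());
    // -> pmid.qformer_perceiver.token_proj.fc1.weight
}
```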
```diff
@@ -435,6 +469,8 @@ std::string convert_tensor_name(std::string name) {
         new_name = convert_open_clip_to_hf_clip(name);
     } else if (starts_with(name, "first_stage_model.decoder")) {
         new_name = convert_vae_decoder_name(name);
+    } else if (starts_with(name, "pmid.qformer_perceiver")) {
+        new_name = convert_pmid_v2_name(name);
     } else if (starts_with(name, "control_model.")) {  // for controlnet pth models
         size_t pos = name.find('.');
         if (pos != std::string::npos) {
@@ -954,7 +990,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
         }
 
         TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin);
-        tensor_storage.reverse_ne();
+        tensor_storage.reverse_ne();
 
         size_t tensor_data_size = end - begin;
 
@@ -1297,7 +1333,7 @@ bool ModelLoader::parse_data_pkl(uint8_t* buffer,
                 reader.tensor_storage.reverse_ne();
                 reader.tensor_storage.file_index = file_index;
                 // if(strcmp(prefix.c_str(), "scarlett") == 0)
-                //     printf(" got tensor %s \n ", reader.tensor_storage.name.c_str());
+                //     printf(" ZIP got tensor %s \n ", reader.tensor_storage.name.c_str());
                 reader.tensor_storage.name = prefix + reader.tensor_storage.name;
                 tensor_storages.push_back(reader.tensor_storage);
                 // LOG_DEBUG("%s", reader.tensor_storage.name.c_str());
@@ -1333,7 +1369,9 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
             std::string name = zip_entry_name(zip);
             size_t pos       = name.find("data.pkl");
             if (pos != std::string::npos) {
+
                 std::string dir = name.substr(0, pos);
+                printf("ZIP %d, name = %s, dir = %s \n", i, name.c_str(), dir.c_str());
                 void* pkl_data = NULL;
                 size_t pkl_size;
                 zip_entry_read(zip, &pkl_data, &pkl_size);
```
**model.h**: 6 changes (6 additions, 0 deletions)
```diff
@@ -28,6 +28,11 @@ enum SDVersion {
     VERSION_COUNT,
 };
 
+enum PMVersion {
+    VERSION_1,
+    VERSION_2,
+};
+
 struct TensorStorage {
     std::string name;
     ggml_type type = GGML_TYPE_F32;
@@ -156,6 +161,7 @@ class ModelLoader {
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                       ggml_backend_t backend,
                       std::set<std::string> ignore_tensors = {});
+
     bool save_to_gguf_file(const std::string& file_path, ggml_type type);
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
     int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
```