Mix precision #8

Open · wants to merge 14 commits into main
28 changes: 28 additions & 0 deletions Plot.py
@@ -0,0 +1,28 @@
import matplotlib.pyplot as plt
import numpy as np
import torch
import argparse
def plot_bit_delta(title):
plt.figure(figsize=(10, 5))
plt.plot(bit_delta, label=f'Bit-Delta {map[args.param_type]}')
plt.plot(svd_delta, label=f'svd Data {map[args.param_type]}')
plt.plot(mix_delta, label=f'Ours {map[args.param_type]}')
plt.title("Comparison of the Cosine Similarity between the Bit-Delta, SVD, and our method with WizardMath-7B-v1.0")
plt.xlabel(f'{map[args.param_type]} of each layer')  # x-axis label
plt.ylabel('Cosine Similarity Value')  # y-axis label
plt.legend()
plt.savefig(f'./figures/{map[args.param_type]}_cos_sim.pdf')
plt.show()


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--param_type', type=str, help='projection type to plot (e.g. q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj)')
map = {"q_proj":"Query_proj", "k_proj":"Key_proj","v_proj":"Value_proj","o_proj":"Output_proj","gate_proj":"Gate_proj","up_proj":"Up_proj","down_proj":"Down_proj"}
args = parser.parse_args()

bit_delta = torch.load(f'./statistic/{args.param_type}_bitdelta_cos_sim.pt')
svd_delta = torch.load(f'./statistic/{args.param_type}_svd_cos_sim.pt')
mix_delta = torch.load(f'./statistic/{args.param_type}_mix_cos_sim.pt')

plot_bit_delta('Cosine Similarity of Bit-Delta, svd and mixed Data')
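For context, Plot.py expects three per-layer cosine-similarity tensors under ./statistic/ that are produced elsewhere; the script that writes them is not part of this diff. A minimal sketch of how such a tensor could be computed is shown below — the per-layer delta lists and the example file name are assumptions, only the path pattern comes from the loads above.

```python
# Hypothetical sketch: producing the per-layer cosine-similarity tensor that Plot.py
# loads from ./statistic/<param_type>_mix_cos_sim.pt. The lists of per-layer deltas
# (exact vs. compressed) are assumed inputs.
import torch
import torch.nn.functional as F

def layerwise_cos_sim(exact_deltas, approx_deltas):
    # One cosine similarity per layer, comparing flattened weight deltas.
    return torch.stack([
        F.cosine_similarity(e.flatten(), a.flatten(), dim=0)
        for e, a in zip(exact_deltas, approx_deltas)
    ])

# torch.save(layerwise_cos_sim(exact_q_deltas, mixed_q_deltas),
#            "./statistic/q_proj_mix_cos_sim.pt")
```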
118 changes: 105 additions & 13 deletions bitdelta/diff.py
@@ -9,14 +9,18 @@ class BinaryDiff(nn.Module):
def __init__(self, base, finetune):
super().__init__()
diff = finetune - base
# Pull out the most positive and most negative 0.02% of delta entries and zero them
# in `diff`, so the binarization below is fitted only to the non-outlier entries.
outlier = get_outlier(diff, percent=0.02)
set_zero(diff, outlier)
quantile = diff.float().abs().mean()

mask = torch.ones_like(diff)
mask[diff < 0] = 0
mask = pack(mask.bool().T)

self.register_buffer("mask", mask)
self.register_buffer("base", base.T)
self.register_buffer("outlier", outlier)
self.register_parameter(
"coeff",
nn.Parameter(
@@ -38,7 +42,40 @@ def forward(self, x):
repeated_mask = self.mask.unsqueeze(0).repeat(x.size(0), 1, 1)
return x @ self.base + self.coeff * binary_bmm(x, repeated_mask)

def compress_diff(base_model, finetuned_model, finetuned_compressed_model):
def set_zero(A, B):
# Zero out the entries of A wherever B is non-zero (i.e. remove the outliers from A)
mask = B != 0
A[mask] = 0
return A

def get_outlier(tensor, percent=0.5):
# Number of elements to keep at each extreme (`percent` is a percentage, not a fraction)
num_elements = tensor.numel()
num_to_keep = int(num_elements * percent / 100)

# Flatten the tensor and find the indices of the most positive and most negative elements
flat_tensor = tensor.flatten()
_, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True)
_, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False)

# Start from an all-zero tensor of the same shape
result = torch.zeros_like(tensor)

# Keep only the extreme elements, at their original positions
result = result.flatten()
result[top_indices] = flat_tensor[top_indices]
result[bottom_indices] = flat_tensor[bottom_indices]
result = result.reshape(tensor.shape)

return result

def copy_nonzero_values(A, B):
# Copy the non-zero values of B into the corresponding positions of A
mask = B != 0
A[mask] = B[mask]
return A

def compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=None):
def compress_submodule(name, subname, module, submodule):
target_device = submodule.weight.device

@@ -57,13 +94,29 @@ def compress_submodule(name, subname, module, submodule):
setattr(module, subname, compressed)

# TODO: this can be parallelized
for name, module in finetuned_compressed_model.named_modules():
if "mlp" in name or "self_attn" in name:
for subname, submodule in module.named_children():
if "proj" in subname:
compress_submodule(name, subname, module, submodule)
with torch.no_grad():
for name, module in finetuned_model.named_modules():
if "self_attn" in name or "mlp" in name:
for subname, submodule in module.named_children():
if "proj" in subname:
# Base and fine-tuned weights of this projection layer
p, f = base_model.get_submodule(f"{name}.{subname}").weight.detach(), finetuned_model.get_submodule(f"{name}.{subname}").weight.detach()

compressed = BinaryDiff(base=p, finetune=f)
mask, coeff, outlier = compressed.mask, compressed.coeff, compressed.outlier
# Reconstruct the binarized delta: unpack the sign mask to {-1, +1} and scale by coeff
weight = (unpack(mask)*2-1) * coeff
weight = weight.T.to(outlier.dtype)

# Put the full-precision outlier entries back on top of the binarized delta
copy_nonzero_values(weight, outlier)
finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p + weight.to(p.dtype))

finetuned_model.save_pretrained("/home/pingbowen/workspace/delta-compression/BitDelta/save/test")




def save_diff(finetuned_compressed_model, save_dir):
def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None):
diff_dict = {}

for name, module in finetuned_compressed_model.named_modules():
@@ -79,9 +132,10 @@ def save_diff(finetuned_compressed_model, save_dir):
torch.save(diff_dict, save_dir)

@torch.no_grad()
def load_diff(model, diff_dir):
def load_diff(model, diff_dir,ori_diff):
device = model.device
diff_dict = torch.load(diff_dir)
# ori_diff = torch.load(ori_diff)

for name, module in model.named_modules():
if name + ".mask" in diff_dict:
@@ -91,10 +145,15 @@ def load_diff(model, diff_dir):
# setattr(module, "mask", mask)
# setattr(module, "coeff", coeff)
# Binarized part of the delta: sign mask expanded to {-1, +1} and scaled by the coefficient
weight = (unpack(mask)*2-1) * coeff
# Rank-64 SVD approximation of the original full-precision delta for this layer
weight_fp16 = decomposition(ori_diff[name + ".weight"].to(torch.float32), dim=64).to(torch.bfloat16)

module.weight.add_(weight.T.to(module.weight.dtype))
module.weight.add_(weight_fp16.to(module.weight.dtype) + weight.T.to(module.weight.dtype))
elif name + ".weight" in diff_dict:
module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype))

# if "mlp" in name:
# import pdb; pdb.set_trace()

elif name + '.A' in diff_dict:
A = diff_dict[name + '.A'].to(device)
@@ -105,11 +164,44 @@ def load_diff(model, diff_dir):

model.config.vocab_size = model.lm_head.weight.size(0)

def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device):
def decomposition(masked_input_tensor,dim=None,st=None,ed=None):
# Truncated SVD reconstruction: keep the top-`dim` singular components,
# or the components in the range [st, ed) if given
U, S, V = torch.svd(masked_input_tensor.to(torch.float32))

if dim is not None:
U, S, V = U[:, :dim], S[:dim], V[:, :dim]

if st is not None and ed is not None:
U, S, V = U[:, st:ed], S[st:ed], V[:, st:ed]

return torch.mm(torch.mm(U, torch.diag(S)), V.t())

def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None):
base_model = get_model(base_model_name, device)
tokenizer = get_tokenizer(finetuned_model_name)
load_diff(base_model, diff_dir)


finetuned_model = get_model(finetuned_model_name, device)
# params = {}

# for k ,v in finetuned_model.named_parameters():
# if layers is not None:
# for layer in layers:
# if layer in k:
# if "mlp" in k or "self_attn" in k:
# delta = v.detach().cpu() - base_model.get_submodule(k.replace('.weight',"")).weight.detach().cpu()
# dim = 128
# if "mlp" in k:
# dim = int(dim * 1.45)
# # import pdb; pdb.set_trace()
# params[k] = decomposition(delta.to(torch.float32), dim).to(torch.bfloat16)

# dict(base_model.named_parameters())['model.layers.0.self_attn.o_proj.weight']

# with torch.no_grad():
# for param in params:
# base_model.get_submodule(param.replace('.weight',"")).weight.add_(params[param].detach().to(device))

load_diff(base_model, diff_dir,ori_diff=ori_diff)

base_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

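To make the new compression path easier to follow, here is an illustrative sketch (not code from the PR) of how the helpers added in this diff — get_outlier, set_zero, copy_nonzero_values and decomposition — combine for a single weight matrix. The repo's pack/unpack kernels are replaced by torch.sign so the example stays self-contained; the 0.02% outlier ratio and rank 64 mirror the values used above, everything else is an assumption.

```python
# Illustrative sketch of the mixed-precision delta for one weight matrix.
import torch
from bitdelta.diff import get_outlier, set_zero, copy_nonzero_values, decomposition

def mixed_precision_delta(base, finetune, outlier_percent=0.02, rank=64):
    delta = finetune - base

    # 1) Keep the 0.02% most positive and 0.02% most negative delta entries in full precision.
    outlier = get_outlier(delta, percent=outlier_percent)
    set_zero(delta, outlier)

    # 2) Binarize what is left: sign of the delta times its mean absolute value ("coeff").
    #    torch.sign stands in for the packed mask used by BinaryDiff.
    coeff = delta.float().abs().mean()
    binary = torch.sign(delta) * coeff

    # 3) Low-rank SVD correction of the original delta, as load_diff adds via decomposition(dim=64).
    low_rank = decomposition((finetune - base).to(torch.float32), dim=rank)

    # Reconstruction: base + binarized delta (with outliers restored) + low-rank term.
    approx = binary.clone()
    copy_nonzero_values(approx, outlier)
    return base + approx.to(base.dtype) + low_rank.to(base.dtype)
```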
38 changes: 38 additions & 0 deletions bitdelta/diff.py.rej
@@ -0,0 +1,38 @@
diff a/bitdelta/diff.py b/bitdelta/diff.py (rejected hunks)
@@ -73,24 +86,31 @@ def save_diff(finetuned_compressed_model, save_dir):
diff_dict[name + ".coeff"] = module.coeff.cpu()

for name, param in finetuned_compressed_model.named_parameters():
+ if "mlp" in name or "self_attn" in name:
+ if Pass(layers,name) == True:
+ continue
+
if param.requires_grad:
diff_dict[name] = param.cpu()
-
+
+ # import pdb; pdb.set_trace()
torch.save(diff_dict, save_dir)

@torch.no_grad()
def load_diff(model, diff_dir):
device = model.device
diff_dict = torch.load(diff_dir)
-
+
for name, module in model.named_modules():
if name + ".mask" in diff_dict:
coeff = diff_dict[name + ".coeff"].to(device)
mask = diff_dict[name + ".mask"].to(device)

- setattr(module, "mask", mask)
- setattr(module, "coeff", coeff)
- # module.weight.add_((mask * coeff).to(module.weight.dtype))
+ # setattr(module, "mask", mask)
+ # setattr(module, "coeff", coeff)
+ weight = (unpack(mask)*2-1) * coeff
+
+ module.weight.add_(weight.T.to(module.weight.dtype))
elif name + ".weight" in diff_dict:
module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype))
