diff --git a/Plot.py b/Plot.py
new file mode 100644
index 0000000..a6d550d
--- /dev/null
+++ b/Plot.py
@@ -0,0 +1,28 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+import argparse
+def plot_bit_delta(title):
+    plt.figure(figsize=(10, 5))
+    plt.plot(bit_delta, label=f'Bit-Delta {map[args.param_type]}')
+    plt.plot(svd_delta, label=f'svd Data {map[args.param_type]}')
+    plt.plot(mix_delta, label=f'Ours {map[args.param_type]}')
+    plt.title("Comparison of the Cosine Similarity between the Bit-Delta, SVD, and our method with WizardMath-7B-v1.0")
+    plt.xlabel(f'{map[args.param_type]} of each layer') # x-axis label
+    plt.ylabel('Cosine Similarity Value') # y-axis label
+    plt.legend()
+    plt.savefig(f'./figures/{map[args.param_type]}_cos_sim.pdf')
+    plt.show()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--param_type', type=str, help='projection type to plot (q_proj, k_proj, ...)')
+    map = {"q_proj":"Query_proj", "k_proj":"Key_proj","v_proj":"Value_proj","o_proj":"Output_proj","gate_proj":"Gate_proj","up_proj":"Up_proj","down_proj":"Down_proj"}
+    args = parser.parse_args()
+
+    bit_delta = torch.load(f'./statistic/{args.param_type}_bitdelta_cos_sim.pt')
+    svd_delta = torch.load(f'./statistic/{args.param_type}_svd_cos_sim.pt')
+    mix_delta = torch.load(f'./statistic/{args.param_type}_mix_cos_sim.pt')
+
+    plot_bit_delta('Cosine Similarity of Bit-Delta, svd and mixed Data')
\ No newline at end of file
diff --git a/bitdelta/diff.py b/bitdelta/diff.py
index c2b03ce..594b936 100644
--- a/bitdelta/diff.py
+++ b/bitdelta/diff.py
@@ -9,14 +9,18 @@ class BinaryDiff(nn.Module):
     def __init__(self, base, finetune):
         super().__init__()
         diff = finetune - base
+        outlier = get_outlier(diff, percent=0.02)
+        set_zero(diff, outlier)
+        # import pdb; pdb.set_trace()
         quantile = diff.float().abs().mean()
 
         mask = torch.ones_like(diff)
         mask[diff < 0] = 0
         mask = pack(mask.bool().T)
-
+
         self.register_buffer("mask", mask)
         self.register_buffer("base", base.T)
+        self.register_buffer("outlier", outlier)
         self.register_parameter(
             "coeff",
             nn.Parameter(
@@ -38,7 +42,40 @@ def forward(self, x):
         repeated_mask = self.mask.unsqueeze(0).repeat(x.size(0), 1, 1)
         return x @ self.base + self.coeff * binary_bmm(x, repeated_mask)
 
-def compress_diff(base_model, finetuned_model, finetuned_compressed_model):
+def set_zero(A, B):
+    # zero out A at the positions where B is non-zero
+    mask = B != 0
+    A[mask] = 0
+    return A
+
+def get_outlier(tensor, percent=0.5):
+    # number of elements to keep as outliers
+    num_elements = tensor.numel()
+    num_to_keep = int(num_elements * percent / 100)
+
+    # flatten the tensor and take the indices of the largest and smallest elements
+    flat_tensor = tensor.flatten()
+    _, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True)
+    _, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False)
+
+    # all-zero tensor of the same shape
+    result = torch.zeros_like(tensor)
+
+    # place the largest and smallest elements at their original positions only
+    result = result.flatten()
+    result[top_indices] = flat_tensor[top_indices]
+    result[bottom_indices] = flat_tensor[bottom_indices]
+    result = result.reshape(tensor.shape)
+
+    return result
+
+def copy_nonzero_values(A, B):
+    # copy the non-zero values of B into the corresponding positions of A
+    mask = B != 0
+    A[mask] = B[mask]
+    return A
+
+def compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=None):
     def compress_submodule(name, subname, module, submodule):
         target_device = submodule.weight.device
 
@@ -57,13 +94,29 @@ def compress_submodule(name, subname, module, submodule):
         setattr(module, subname, compressed)
 
     # TODO: this can be parallelized
-    for name, module in finetuned_compressed_model.named_modules():
-        if "mlp" in name or "self_attn" in name:
-            for 
subname, submodule in module.named_children(): - if "proj" in subname: - compress_submodule(name, subname, module, submodule) + # flag = False + with torch.no_grad(): + for name, module in finetuned_model.named_modules(): + if "self_attn" in name or "mlp" in name: + for subname, submodule in module.named_children(): + if "proj" in subname: + p , f = base_model.get_submodule(f"{name}.{subname}").weight.detach() , finetuned_model.get_submodule(f"{name}.{subname}").weight.detach() + + compressed = BinaryDiff(base=p, finetune=f) + mask, coeff , outlier = compressed.mask, compressed.coeff, compressed.outlier + weight = (unpack(mask)*2-1) * coeff + weight = weight.T.to(outlier.dtype) + + copy_nonzero_values(weight, outlier) + # import pdb; pdb.set_trace() + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + weight.to(p.dtype)) + + finetuned_model.save_pretrained("/home/pingbowen/workspace/delta-compression/BitDelta/save/test") + + + -def save_diff(finetuned_compressed_model, save_dir): +def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): diff_dict = {} for name, module in finetuned_compressed_model.named_modules(): @@ -79,9 +132,10 @@ def save_diff(finetuned_compressed_model, save_dir): torch.save(diff_dict, save_dir) @torch.no_grad() -def load_diff(model, diff_dir): +def load_diff(model, diff_dir,ori_diff): device = model.device diff_dict = torch.load(diff_dir) + # ori_diff = torch.load(ori_diff) for name, module in model.named_modules(): if name + ".mask" in diff_dict: @@ -91,10 +145,15 @@ def load_diff(model, diff_dir): # setattr(module, "mask", mask) # setattr(module, "coeff", coeff) weight = (unpack(mask)*2-1) * coeff + weight_fp16 = decomposition(ori_diff[name + ".weight"].to(torch.float32), dim=64).to(torch.bfloat16) + # import pdb; pdb.set_trace() - module.weight.add_(weight.T.to(module.weight.dtype)) + module.weight.add_(weight_fp16.to(module.weight.dtype) + weight.T.to(module.weight.dtype)) elif name + ".weight" in diff_dict: module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + + # if "mlp" in name: + # import pdb; pdb.set_trace() elif name + '.A' in diff_dict: A = diff_dict[name + '.A'].to(device) @@ -105,11 +164,44 @@ def load_diff(model, diff_dir): model.config.vocab_size = model.lm_head.weight.size(0) -def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device): +def decomposition(masked_input_tensor,dim=None,st=None,ed=None): + U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) + + if dim is not None: + U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] + + if st is not None and ed is not None: + U , S , V = U[:, st:ed],S[st:ed] ,V[:, st:ed] + + return torch.mm(torch.mm(U, torch.diag(S)), V.t()) + +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): base_model = get_model(base_model_name, device) tokenizer = get_tokenizer(finetuned_model_name) - load_diff(base_model, diff_dir) - + + finetuned_model = get_model(finetuned_model_name, device) + # params = {} + + # for k ,v in finetuned_model.named_parameters(): + # if layers is not None: + # for layer in layers: + # if layer in k: + # if "mlp" in k or "self_attn" in k: + # delta = v.detach().cpu() - base_model.get_submodule(k.replace('.weight',"")).weight.detach().cpu() + # dim = 128 + # if "mlp" in k: + # dim = int(dim * 1.45) + # # import pdb; pdb.set_trace() + # params[k] = decomposition(delta.to(torch.float32), dim).to(torch.bfloat16) + + # 
dict(base_model.named_parameters())['model.layers.0.self_attn.o_proj.weight'] + + # with torch.no_grad(): + # for param in params: + # base_model.get_submodule(param.replace('.weight',"")).weight.add_(params[param].detach().to(device)) + + load_diff(base_model, diff_dir,ori_diff=ori_diff) + base_model.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir) diff --git a/bitdelta/diff.py.rej b/bitdelta/diff.py.rej new file mode 100644 index 0000000..5f60f5f --- /dev/null +++ b/bitdelta/diff.py.rej @@ -0,0 +1,38 @@ +diff a/bitdelta/diff.py b/bitdelta/diff.py (rejected hunks) +@@ -73,24 +86,31 @@ def save_diff(finetuned_compressed_model, save_dir): + diff_dict[name + ".coeff"] = module.coeff.cpu() + + for name, param in finetuned_compressed_model.named_parameters(): ++ if "mlp" in name or "self_attn" in name: ++ if Pass(layers,name) == True: ++ continue ++ + if param.requires_grad: + diff_dict[name] = param.cpu() +- ++ ++ # import pdb; pdb.set_trace() + torch.save(diff_dict, save_dir) + + @torch.no_grad() + def load_diff(model, diff_dir): + device = model.device + diff_dict = torch.load(diff_dir) +- ++ + for name, module in model.named_modules(): + if name + ".mask" in diff_dict: + coeff = diff_dict[name + ".coeff"].to(device) + mask = diff_dict[name + ".mask"].to(device) + +- setattr(module, "mask", mask) +- setattr(module, "coeff", coeff) +- # module.weight.add_((mask * coeff).to(module.weight.dtype)) ++ # setattr(module, "mask", mask) ++ # setattr(module, "coeff", coeff) ++ weight = (unpack(mask)*2-1) * coeff ++ ++ module.weight.add_(weight.T.to(module.weight.dtype)) + elif name + ".weight" in diff_dict: + module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py new file mode 100644 index 0000000..1afabdc --- /dev/null +++ b/bitdelta/diff2.py @@ -0,0 +1,297 @@ +import torch +import torch.nn as nn +import gc +import torch.nn.functional as F +from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm +from bitdelta.utils import get_model, get_tokenizer + +# 离群值抽出之后 原来位置设定成多少,如果设置成0会让分母增大 +# U, V + +class BinaryDiff(nn.Module): + def __init__(self, weight): + super().__init__() + diff = weight + quantile = diff.float().abs().mean() + + mask = torch.ones_like(diff) + mask[diff < 0] = 0 + mask = pack(mask.bool().T) + + self.register_buffer("mask", mask) + # self.register_buffer("base", base.T) + self.register_parameter( + "coeff", + nn.Parameter( + torch.tensor( + quantile, + dtype=torch.float32, + requires_grad=True, + device=weight.device, + ) + ), + ) + # del base, finetune, diff + + def forward(self, x): + # print(x.shape, self.base.shape, self.coeff.shape, self.mask.shape) + # [B, seq, in] @ [in, out] + [B, seq, in] @ [B, in/32, out] + + # TODO: This can be faster + repeated_mask = self.mask.unsqueeze(0).repeat(x.size(0), 1, 1) + return x @ self.base + self.coeff * binary_bmm(x, repeated_mask) + +def Pass(layers=None,name=None): + if layers is not None: + for layer in layers: + if layer in name: + return True + return False + + +def solve_orthogonal(p, f): + # 计算x + delta ,n , sacled_p = f - p, p.shape[-1],p + + # import pdb; pdb.set_trace() + + for i in range(n): + p_i,f_i = p[:,i],f[:,i] + dot_fp , dot_pd = torch.dot(f_i, p_i) , torch.dot(p_i, delta[:,i]) + + if dot_fp == 0 or dot_pd == 0: # p_i或f_i是零向量,因为低秩, 边界p_i与delta_i直接正交 + continue + + dot_pp = torch.dot(p_i, p_i) + x = dot_fp / dot_pp if dot_pp != 0 else None + + + # 计算(f - xp) + with torch.no_grad(): + delta[:,i].data.copy_(f_i - x 
* p_i) if x is not None else None
+            sacled_p[:,i].data.copy_(sacled_p[:,i].data * x) if x is not None else None
+
+    # import pdb; pdb.set_trace()
+
+    return delta , sacled_p
+
+def get_outlier(tensor, percent=0.5):
+    # number of elements to keep as outliers
+    num_elements = tensor.numel()
+    num_to_keep = int(num_elements * percent / 100)
+
+    # flatten the tensor and take the indices of the largest and smallest elements
+    flat_tensor = tensor.flatten()
+    _, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True)
+    _, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False)
+
+    # all-zero tensor of the same shape
+    result = torch.zeros_like(tensor)
+
+    # place the largest and smallest elements at their original positions only
+    result = result.flatten()
+    result[top_indices] = flat_tensor[top_indices]
+    result[bottom_indices] = flat_tensor[bottom_indices]
+    result = result.reshape(tensor.shape)
+
+    return result
+
+def copy_nonzero_values(A, B):
+    # copy the non-zero values of B into the corresponding positions of A
+    mask = B != 0
+    A[mask] = B[mask]
+    return A
+
+def compress_diff(base_model, finetuned_model, save_dir,args):
+    def compress_submodule(name, subname, module, submodule):
+        target_device = submodule.weight.device
+
+        base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(target_device)
+        finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(target_device)
+
+        compressed = BinaryDiff(
+            base=base_weight,
+            finetune=finetuned_weight,
+        ).to(target_device)
+
+        del submodule, base_weight
+        setattr(module, subname, None)
+        gc.collect()
+        torch.cuda.empty_cache()
+        setattr(module, subname, compressed)
+
+    # TODO: choose the compression ratio according to a threshold
+    param_dict = dict()
+    for name, module in finetuned_model.named_modules():
+        # import pdb; pdb.set_trace()
+        if "vision" in name:
+            continue
+
+        if "self_attn" in name or "mlp" in name:
+            for subname, submodule in module.named_children():
+
+                with torch.no_grad():
+                    if "proj" in subname:
+                        p = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device)
+                        f = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device)
+
+                        delta , outlier_U, outlier_V = f - p , None, None
+
+                        if args.choice == "mix":
+                            dim , fp16_col = 1024, 64
+
+                            # decomposition() currently returns only U, S, V (its outlier
+                            # extraction is commented out), so outlier_U / outlier_V stay None
+                            if "self_attn" in name:
+                                U,S,V = decomposition(delta,dim=dim,name=name,attn_outlier=args.attn_outlier)
+                            else:
+                                dim , fp16_col = 1024 , 128
+                                # delta , scaled_p = solve_orthogonal(p, f)
+                                U,S,V = decomposition(delta,dim=dim,name=name,mlp_outlier=args.mlp_outlier)
+
+                            compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device)
+                            U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff
+                            weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff
+                            U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T
+
+
+                            if outlier_U is not None and outlier_V is not None:
+                                copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V)
+                                # import pdb; pdb.set_trace()
+
+                            delta = U @ torch.diag(S) @ V.t()
+                        elif args.choice == "bit":
+                            compressed = BinaryDiff(weight=delta).to(f.device)
+                            mask , coeff = compressed.mask, compressed.coeff
+                            delta = (unpack(mask)*2-1) * coeff
+                            delta = delta.T
+
+                            if "llava" in args.finetuned_model.lower():
+                                param_dict[f"{name}.{subname}" + ".weight"] = p + delta.to(p.dtype)
+
+                        elif args.choice == "svd":
+                            dim = args.dim
+
+                            if "mlp" in name:
+                                dim = int(dim * args.scale_factor)
+
+                            U , S , V = decomposition((f - p).clone().detach(),dim=dim)
+                            param_dict[f"{name}.{subname}" 
+ ".base"] = p + param_dict[f"{name}.{subname}" + ".U"] = U.to(p.dtype) + param_dict[f"{name}.{subname}" + ".S"] = S.to(p.dtype) + param_dict[f"{name}.{subname}" + ".V"] = V.to(p.dtype) + # if "llava" in args.finetuned_model.lower(): + # U , S , V = decomposition((f - p).clone().detach(),dim=1024) + # param_dict[f"{name}.{subname}" + ".base"] = p + # param_dict[f"{name}.{subname}" + ".U"] = U.to(p.dtype) + # param_dict[f"{name}.{subname}" + ".S"] = S.to(p.dtype) + # param_dict[f"{name}.{subname}" + ".V"] = V.to(p.dtype) + + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) + + if "llava" in args.finetuned_model.lower() and args.choice == "bit": + torch.save(param_dict, args.svd_dict) + + if args.choice == "svd": + torch.save(param_dict, args.svd_dict) + + + finetuned_model.to(torch.bfloat16) + finetuned_model.save_pretrained(save_dir) + +def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): + diff_dict = {} + + for name, module in finetuned_compressed_model.named_modules(): + if isinstance(module, BinaryDiff): + # diff_dict[name + ".mask"] = (module.mask == 1).bool().cpu() + diff_dict[name + ".mask"] = module.mask.cpu() + diff_dict[name + ".coeff"] = module.coeff.cpu() + + for name, param in finetuned_compressed_model.named_parameters(): + if param.requires_grad: + diff_dict[name] = param.cpu() + + torch.save(diff_dict, save_dir) + +@torch.no_grad() +def load_diff(model, diff_dir,ori_diff): + device = model.device + diff_dict = torch.load(diff_dir) + # ori_diff = torch.load(ori_diff) + + for name, module in model.named_modules(): + if name + ".mask" in diff_dict: + coeff = diff_dict[name + ".coeff"].to(device) + mask = diff_dict[name + ".mask"].to(device) + + # setattr(module, "mask", mask) + # setattr(module, "coeff", coeff) + weight = (unpack(mask)*2-1) * coeff + weight_fp16 = decomposition(ori_diff[name + ".weight"].to(torch.float32), dim=64).to(torch.bfloat16) + # import pdb; pdb.set_trace() + + module.weight.add_(weight_fp16.to(module.weight.dtype) + weight.T.to(module.weight.dtype)) + elif name + ".weight" in diff_dict: + module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + + # if "mlp" in name: + # import pdb; pdb.set_trace() + + elif name + '.A' in diff_dict: + A = diff_dict[name + '.A'].to(device) + B = diff_dict[name + '.B'].to(device) + + mask = (A @ B).T + module.weight.add_(mask.to(module.weight.dtype)) + + model.config.vocab_size = model.lm_head.weight.size(0) + +def set_zero(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = 0 + return A + + +def decomposition(masked_input_tensor,dim=None,name=None,attn_outlier=0.1,mlp_outlier=0.1): + U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) + + outlier_U , outlier_V = None, None + + if dim is not None: + U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] + + # if "self_attn" in name: + # outlier_U = get_outlier(U[:,64:], percent=attn_outlier) + # outlier_V = get_outlier(V[:,64:], percent=attn_outlier) + + # set_zero(U[:,64:], outlier_U) + # # import pdb; pdb.set_trace() + # set_zero(V[:,64:], outlier_V) + + # else: + # outlier_U = get_outlier(U[:,128:], percent=mlp_outlier) + # outlier_V = get_outlier(V[:,128:], percent=mlp_outlier) + + # set_zero(U[:,128:], outlier_U) + # set_zero(V[:,128:], outlier_V) + + # max_val, min_val, mean_abs_val = round(torch.max(U).item(),4), round(torch.min(U).item(),4), round(torch.mean(torch.abs(U)).item(),4) + + # print(f"max_val {max_val} pos_min {round(torch.min(outlier[outlier 
> 0]).item(),4)} mean_abs_val {mean_abs_val} ratio {round(torch.min(outlier[outlier > 0]).item() / mean_abs_val,4)}") + # import pdb; pdb.set_trace() + return U, S, V # , outlier_U, outlier_V + +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): + base_model = get_model(base_model_name, device) + tokenizer = get_tokenizer(finetuned_model_name) + + finetuned_model = get_model(finetuned_model_name, device) + # params = {} + + load_diff(base_model, diff_dir,ori_diff=ori_diff) + + base_model.save_pretrained(save_dir) + tokenizer.save_pretrained(save_dir) + + del base_model + diff --git a/bitdelta/train.py b/bitdelta/train.py index 946dafb..a4a44e5 100644 --- a/bitdelta/train.py +++ b/bitdelta/train.py @@ -23,6 +23,13 @@ base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) +def original_diff(base_model, finetuned_model): + origin_diff = {} + for k, v in finetuned_model.named_parameters(): + if "mlp" in k or "self_attn" in k: + origin_diff[k] = v.detach().cpu() - base_model.get_submodule(k.replace('.weight',"")).weight.detach().cpu() + return origin_diff + # get corr/stddev stats if args.debug: print(f"finding corr/stddev stats...") @@ -37,8 +44,12 @@ finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) print(f"compressing diff...") -compress_diff(base_model, finetuned_model, finetuned_compressed_model) +compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=args.layers) + +tokenizer.save_pretrained("/home/pingbowen/workspace/delta-compression/BitDelta/save/test") + +''' train_num_samples = args.batch_size * args.num_steps train_dataset = get_dataset( args.dataset_name, @@ -55,37 +66,38 @@ ) # save untrained delta -save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt")) +save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt"),layers=args.layers) -optimizer = torch.optim.AdamW(finetuned_compressed_model.parameters(), lr=args.lr) -scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_steps) +if args.train: + optimizer = torch.optim.AdamW(finetuned_compressed_model.parameters(), lr=args.lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_steps) -bar = tqdm(train_dataloader) + bar = tqdm(train_dataloader) -train_loss_list = [] + train_loss_list = [] -# Train loop -for step, batch in enumerate(bar): - batch1 = {k: v.to(finetuned_model.device) for k, v in batch.items()} - with torch.inference_mode(): - finetuned_outputs = finetuned_model(**batch1) + # Train loop + for step, batch in enumerate(bar): + batch1 = {k: v.to(finetuned_model.device) for k, v in batch.items()} + with torch.inference_mode(): + finetuned_outputs = finetuned_model(**batch1) - batch2 = {k: v.to(finetuned_compressed_model.device) for k, v in batch.items()} - finetuned_compressed_outputs = finetuned_compressed_model(**batch2) + batch2 = {k: v.to(finetuned_compressed_model.device) for k, v in batch.items()} + finetuned_compressed_outputs = finetuned_compressed_model(**batch2) - loss = F.mse_loss( - finetuned_outputs.logits.clone().to(finetuned_compressed_outputs.logits.device), - finetuned_compressed_outputs.logits, - ) + loss = F.mse_loss( + 
finetuned_outputs.logits.clone().to(finetuned_compressed_outputs.logits.device), + finetuned_compressed_outputs.logits, + ) - train_loss_list.append(loss.item()) + train_loss_list.append(loss.item()) - optimizer.zero_grad() - loss.backward() - optimizer.step() - scheduler.step() + optimizer.zero_grad() + loss.backward() + optimizer.step() + scheduler.step() - bar.set_description(f"train loss: {loss.item()}") + bar.set_description(f"train loss: {loss.item()}") # save loss list @@ -93,14 +105,17 @@ with open(os.path.join(args.save_dir, f"train_loss_{args.num_groups}.json"), "w") as f: json.dump(train_loss_list, f) -# save trained delta -save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff.pt")) +ori_diff = original_diff(base_model, finetuned_model) + +# # save trained delta +save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff.pt"),layers=args.layers) del base_model, finetuned_model, finetuned_compressed_model torch.cuda.empty_cache() if args.save_full_model: print("saving uncalibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, "uncalibrated_model"), device="cpu") - print("saving calibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") + save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, f"uncalibrated_model"), device="cpu",layers=args.layers,ori_diff=ori_diff) + # print("saving calibrated model") + # save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") +''' \ No newline at end of file diff --git a/bitdelta/train2.py b/bitdelta/train2.py new file mode 100644 index 0000000..7073a57 --- /dev/null +++ b/bitdelta/train2.py @@ -0,0 +1,33 @@ +import os + +import torch + +import torch.nn.functional as F +from bitdelta.diff2 import compress_diff, save_diff, save_full_model +from bitdelta.misc import find_corr_stddev + +from bitdelta.utils import get_model, parse_args, get_tokenizer,load_llava +from tqdm import tqdm +from bitdelta.data import get_dataset, get_dataloader + +import json + +args = parse_args() + +# create save_dir if it doesn't exist +os.makedirs(args.save_dir, exist_ok=True) + +tokenizer = get_tokenizer(args.finetuned_model) + +with torch.no_grad(): + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) + if "llava" in args.finetuned_model.lower(): + finetuned_model = load_llava(args.finetuned_model,device="cuda" if torch.cuda.is_available() else "cpu") + else: + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + + +print(f"compressing diff...") +compress_diff(base_model, finetuned_model, args.save_dir,args) + +tokenizer.save_pretrained(args.save_dir) \ No newline at end of file diff --git a/bitdelta/utils.py b/bitdelta/utils.py index a7c55ea..3fff503 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -1,12 +1,52 @@ import argparse import transformers import torch -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM,AutoTokenizer from accelerate import infer_auto_device_map, init_empty_weights +import os +try: + from llava.model import * + from llava.constants import 
DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +except: pass + +def load_llava(path,device): + tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False) + model = LlavaLlamaForCausalLM.from_pretrained( + path, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + ).to(device) + + + image_processor = None + + if 'llava' in path.lower(): + mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) + mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) + if mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + if mm_use_im_start_end: + tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + model.resize_token_embeddings(len(tokenizer)) + + vision_tower = model.get_vision_tower() + if not vision_tower.is_loaded: + vision_tower.load_model(device_map=device) + if device != 'auto': + vision_tower.to(device=device, dtype=torch.float16) + image_processor = vision_tower.image_processor + + if hasattr(model.config, "max_sequence_length"): + context_len = model.config.max_sequence_length + else: + context_len = 2048 + + return model + def parse_args(): parser = argparse.ArgumentParser(description="BitDelta") - + # # models parser.add_argument( "--finetuned_model", type=str, default="lmsys/vicuna-7b-v1.5-16k" @@ -14,16 +54,24 @@ def parse_args(): parser.add_argument("--base_model", type=str, default="meta-llama/Llama-2-7b-hf") # train params + parser.add_argument("--svd_dict", type=str, default="") parser.add_argument("--dataset_name", type=str, default="c4") parser.add_argument("--subset", type=str, default="en") parser.add_argument("--data_dir", type=str, default="en") parser.add_argument("--split", type=str, default="train") parser.add_argument("--lr", type=float, default=1e-4) parser.add_argument("--num_steps", type=int, default=100) + parser.add_argument("--dim", type=int, default=1024) + parser.add_argument("--scale_factor", type=float, default=1.45) parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument("--layers", nargs='+', default=None) + parser.add_argument("--save_num", type=int, default=0) parser.add_argument("--max_length", type=int, default=128) parser.add_argument("--save_dir", type=str, required=True) - + parser.add_argument("--train", action="store_true") + parser.add_argument("--attn_outlier", type=float,default=1e-4) + parser.add_argument("--mlp_outlier", type=float,default=1e-4) + parser.add_argument("--choice", type=str,choices=['mix','bit','svd'],default=None) # device management parser.add_argument("--base_model_device", type=str, default="0") diff --git a/cosine_sim_check.py b/cosine_sim_check.py new file mode 100644 index 0000000..788937a --- /dev/null +++ b/cosine_sim_check.py @@ -0,0 +1,281 @@ +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" +import torch +from torch import nn +import gc +import torch.nn.functional as F +from bitdelta.diff import save_diff, save_full_model +from bitdelta.misc import find_corr_stddev +from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm +from bitdelta.utils import get_model, parse_args, get_tokenizer +from tqdm import tqdm +from bitdelta.data import get_dataset, get_dataloader + +import json +import transformers + +import re +import random +import numpy as np + +def set_random_seed(seed: int = 0): + """ + set random seed + :param seed: int, random seed + :return: + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if 
torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+
+set_random_seed(seed=0)
+
+def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list):
+    """
+    get the names of parameters that need to be merged
+    :param input_param_names: list, names of input parameters
+    :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
+    :return:
+    """
+    param_names_to_merge = []
+    for param_name in input_param_names:
+        exclude = any([re.match(exclude_pattern, param_name) for exclude_pattern in exclude_param_names_regex])
+        if not exclude:
+            param_names_to_merge.append(param_name)
+    return param_names_to_merge
+
+
+def get_model(model_path):
+    if "mistral" in model_path or "mixtral" in model_path:
+        data_type = torch.bfloat16
+    else:
+        data_type = torch.float16
+    with torch.no_grad():
+        model = transformers.AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=data_type,
+            low_cpu_mem_usage=True,
+            # device_map="auto"
+        ).to("cuda")
+    return model
+
+
+
+
+def singular_values_for_variance(tensor, variances=[0.9, 0.95]):
+    """
+    Calculate the minimum number of singular values needed to reach specified variance ratios.
+
+    Parameters:
+    - tensor: A 2D tensor for which to calculate the SVD.
+    - variances: A list of variance ratios to calculate the minimum number of singular values for.
+
+    Returns:
+    A dictionary with the variance ratios as keys and the minimum number of singular values needed as values.
+    """
+    # Compute SVD
+    U, S, V = torch.svd(tensor)
+    # Calculate the squared singular values (proportional to variance explained)
+    squared_singular_values = torch.pow(S, 2)
+    total_variance = torch.sum(squared_singular_values)
+    cumulative_variance_ratios = torch.cumsum(squared_singular_values, dim=0) / total_variance
+
+    # Find the minimum number of singular values for each specified variance
+    results = {}
+    for variance in variances:
+        num_singular_values = torch.searchsorted(cumulative_variance_ratios, variance) + 1  # +1 because indices start at 0
+        results[variance] = num_singular_values.item()
+
+    return results
+
+
+def cosine_similarity_matrix(finetuned_param, pretrained_param):
+    finetuned_flat = finetuned_param.view(-1)
+    pretrained_flat = pretrained_param.view(-1)
+    cosine_similarity = F.cosine_similarity(finetuned_flat.unsqueeze(0), pretrained_flat.unsqueeze(0), dim=1)
+    return cosine_similarity.item()
+
+
+def check_delta_properties(delta_weight):
+    # analyze properties of each linear weight in the delta
+
+    # Frobenius norm of the matrix
+    matrix_norm = torch.norm(delta_weight, p='fro')
+
+    # Condition number of the matrix.
+    # The condition number measures the numerical stability of inverting the matrix: it describes
+    # how small perturbations of the data affect the result of matrix operations. The larger the
+    # condition number, the more sensitive (and potentially unstable) the numerical result is;
+    # the smaller it is, the more stable the matrix and its operations are.
+
+    # Definition:
+    # For a non-singular matrix A, the condition number is the product of the norm of A and the norm of its inverse.
+    # Any matrix norm can be used, but the 2-norm (spectral norm) is the most common; in that case the
+    # condition number is the ratio of the largest to the smallest singular value.
+    cond_number = torch.linalg.cond(delta_weight)
+
+    # rank of the matrix
+    rank = torch.linalg.matrix_rank(delta_weight)
+
+    # effective rank: number of singular values needed to explain 90% / 95% of the variance
+    rank_eff = singular_values_for_variance(delta_weight, variances=[0.9, 0.95])
+    rank_90, rank_95 = rank_eff[0.9], rank_eff[0.95]
+
+
+    return matrix_norm, cond_number, rank, rank_90, rank_95
+
+
+
+
+    ## First part: check cosine similarity in first layer FFN w1
+
+    # if "llama" in base_model_path:
+    #     #weight_key = "model.layers.0.mlp.gate_proj.weight"
+    #     tensor_base = base_model.model.layers[0].mlp.gate_proj.weight
+    #     tensor_ft = 
finetuned_model.model.layers[0].mlp.gate_proj.weight + # cosine_sim = F.cosine_similarity(tensor_base, tensor_ft, dim=1) + # overall_similarity = cosine_sim.mean() + # base_model_name = base_model_path.split("/")[-1] + # finetuned_model_name = finetuned_model_path.split("/")[-1] + # overall_similarity_result = overall_similarity.item() + # print(f"Overall Cosine Similarity between {base_model_name} and {finetuned_model_name}: {overall_similarity_result}") + # ## 说明是llama模型 + # elif "Mixtral" in base_model_path: + # tensor_base = base_model.model.layers[0].block_sparse_moe.experts[0].w1.weight + # tensor_ft = base_model.model.layers[0].block_sparse_moe.experts[1].w1.weight + # cosine_sim = F.cosine_similarity(tensor_base, tensor_ft, dim=1) + # overall_similarity = cosine_sim.mean() + + + + + ## Second part: checkout delta square decline potential using scaled weight + + ## third part: checkout rank of original delta and + ## scaled calculation delta(relation between variance ratio and #singular values) + +def analysis_delta(base_model_path, finetuned_model_path): + pretrained_model = get_model(base_model_path) + finetuned_model = get_model(finetuned_model_path) + print(f"We are analysising the delta between the Pretrained model: {base_model_path} and the Finetuned model: {finetuned_model_path}") + task_vector_param_dict = {} + pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()} + finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} + param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) + + cos_sim_list = [] + norm_list = [] + cond_number_list = [] + rank_list = [] + rank_90_list = [] + rank_95_list = [] + + with torch.no_grad(): + for param_name in param_names_to_merge: + param_list = ['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'] + if all(char not in param_name for char in param_list): + continue + # import pdb + # pdb.set_trace() + #研究finetuned_param_dict[param_name]和pretrained_param_dict[param_name]的cosine similarity + task_vector_param_dict[param_name] = finetuned_param_dict[param_name] - pretrained_param_dict[param_name] + #check similarity + print(f"Investigating param_name: {param_name}") + cos_sim = cosine_similarity_matrix(finetuned_param_dict[param_name].float(), pretrained_param_dict[param_name].float()) + cos_sim_list.append(cos_sim) + print(f"cosine similarity between the finetuned model and pretrained model: ",cos_sim) + #研究他们差值的统计性质 + matrix_norm, cond_number, rank, rank_90, rank_95 = check_delta_properties(task_vector_param_dict[param_name].float()) + norm_list.append(matrix_norm) + cond_number_list.append(cond_number) + rank_list.append(rank) + rank_90_list.append(rank_90) + rank_95_list.append(rank_95) + print(f"Properties of Delta Weight---matrix_norm: {matrix_norm}, cond_number: {cond_number}, rank: {rank}, rank_90: {rank_90}, rank_95: {rank_95}") + + print(f"avg_cos_sim: {sum(cos_sim_list)/len(cos_sim_list)}") + print(f"avg_norm: {sum(norm_list)/len(norm_list)}") + print(f"avg_cond_number: {sum(cond_number_list)/len(cond_number_list)}") + print(f"avg_rank: {sum(rank_list)/len(rank_list)}") + print(f"avg_rank_90: {sum(rank_90_list)/len(rank_90_list)}") + print(f"avg_rank_95: {sum(rank_95_list)/len(rank_95_list)}") + + print(f"Analysis end for the pretrained model: {base_model_path} and finetuned_model: {finetuned_model_path}") + del 
pretrained_model
+    del finetuned_model
+    return
+
+moe_base = "/home/wanghanqing/projects/models/model_ver2/Mixtral-8x7B-v0.1"
+instruct_base = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.2"
+base_model = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-v0.1"
+
+code_llama13 = "/data/public/opensource_models/codellama/codellama-13b-python-hf"
+wizard_coder = "/data/public/opensource_models/WizardLM/WizardCoder-Python-13B-V1.0"
+llama2_7b = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf"
+llama2_7b_chat = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf"
+llama2_13b = "/data/public/opensource_models/meta-llama/Llama-2-13b-hf"
+llama2_13b_chat = "/data/public/opensource_models/meta-llama/Llama-2-13b-chat-hf"
+wizard_math_7b = "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0"
+wizard_math_13b = "/data/public/opensource_models/WizardLM/WizardMath-13B-V1.0"
+meta_math_7b = "/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf"
+magicoder_7b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf"
+magicoder_13b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-13b/ckpts/checkpoints/epoch_2_hf"
+
+
+# Mistral-7B
+## base
+mistral_7b = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-v0.1"
+## finetuned
+mistral_7b_instruct_v1 = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.1"
+mistral_7b_instruct_v2 = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.2"
+
+# llama2-7b
+## base
+llama2_7b = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf"
+## finetuned
+llama2_7b_chat = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf"
+wizard_math_7b = "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0"
+meta_math_7b = "/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf"
+magicoder_7b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf"
+
+# llama2-13b
+## base
+llama2_13b = "/data/public/opensource_models/meta-llama/Llama-2-13b-hf"
+## finetuned
+llama2_13b_chat = "/data/public/opensource_models/meta-llama/Llama-2-13b-chat-hf"
+wizard_math_13b = "/data/public/opensource_models/WizardLM/WizardMath-13B-V1.0"
+magicoder_13b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-13b/ckpts/checkpoints/epoch_2_hf"
+code_llama13 = "/data/public/opensource_models/codellama/codellama-13b-python-hf"
+wizard_coder = "/data/public/opensource_models/WizardLM/WizardCoder-Python-13B-V1.0"
+
+
+
+
+import sys
+
+# open a log file
+log_file = open("analysis_log.txt", "w")
+
+# save the original stdout
+original_stdout = sys.stdout
+
+# redirect stdout to the log file
+sys.stdout = log_file
+
+# from here on, all print() output is written to analysis_log.txt
+print("This will be written to analysis_log.txt")
+
+
+
+
+
+analysis_delta(base_model_path = llama2_7b, finetuned_model_path = llama2_7b_chat)
+analysis_delta(base_model_path = llama2_7b, finetuned_model_path = wizard_math_7b)
+analysis_delta(base_model_path = llama2_7b, finetuned_model_path = meta_math_7b)
+analysis_delta(base_model_path = llama2_7b, finetuned_model_path = magicoder_7b)
+
+# restore the original stdout
+sys.stdout = original_stdout
+
+# close the log file
+log_file.close()
\ No newline at end of file
diff --git a/eval.py b/eval.py
new file mode 100644
index 0000000..5e813b1
--- /dev/null
+++ b/eval.py
@@ -0,0 +1,30 @@
+import argparse
+import transformers
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+
+def load_model(model_name):
+    model = 
transformers.AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True,) + return model + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--base_model', type=str) + parser.add_argument('--finetuned_model', type=str) + args = parser.parse_args() + + base_model = load_model(args.base_model) + finetuned_model = load_model(args.finetuned_model) + + params = dict() + + for n,p in finetuned_model.named_parameters(): + if "mlp" in n or "self_attn" in n: + delta = p - base_model.state_dict()[n] + w = torch.sum(torch.abs(delta)) + params[n] = w.item() + + print(params) \ No newline at end of file diff --git a/lowbit_lowrank.py b/lowbit_lowrank.py new file mode 100644 index 0000000..4f159ff --- /dev/null +++ b/lowbit_lowrank.py @@ -0,0 +1,23 @@ +import os + +import torch + +import torch.nn.functional as F +from bitdelta.diff import compress_diff, save_diff, save_full_model +from bitdelta.misc import find_corr_stddev + +from bitdelta.utils import get_model, parse_args, get_tokenizer +from tqdm import tqdm + +args = parse_args() + +tokenizer = get_tokenizer(args.base_model) + +with torch.no_grad(): + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + +finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) + +print(f"compressing diff...") +compress_diff(base_model, finetuned_model, finetuned_compressed_model) diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..37ed843 --- /dev/null +++ b/run.sh @@ -0,0 +1,45 @@ +MODEL_SAVE_DIR=/home/pingbowen/workspace/delta-compression/save/mistral-v0.2_bitdelta + +mkdir -p $MODEL_SAVE_DIR + +values=(0.05 0.2 0.4 0.5 0.75) + +pretrained_model=(/data/public/opensource_models/meta-llama/Llama-2-7b-hf/ ) +finetuned_model=(/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf +) +svd_dict=(/data/groups/QY_LLM_Other/pingbowen/models/codelora/codelora_svd.pt / ) +save_dir=(/home/pingbowen/workspace/delta-compression/save/test /data/groups/QY_LLM_Other/pingbowen/models/codelora/codelora_bitdelta/) + +for (( i=0; i<2; i++ )); do + +# choice="svd" +if [ $i -eq 0 ]; then + choice="svd" +else + choice="bit" +fi + +gpu0=$((2 * i)) +gpu1=$((2 * i + 1)) +# "$gpu0,$gpu1" +CUDA_VISIBLE_DEVICES=$((i + 1)) python \ + bitdelta/train2.py \ + --base_model ${pretrained_model[0]} \ + --finetuned_model ${finetuned_model[0]} \ + --save_dir ${save_dir[$i]} \ + --batch_size 4 \ + --num_steps 200 \ + --save_full_model True \ + --attn_outlier 0.2 \ + --mlp_outlier 0.1 \ + --svd_dict ${svd_dict[$i]} \ + --dim 1024 \ + --scale_factor 1.46 \ + --choice $choice & + # &> test.log # ${save_dir[$i]} +done +wait + # /data/public/opensource_models/codellama/codellama-7b-python-hf/ /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ + # /home/pingbowen/models/vicuna-13b-v1.5 , /home/pingbowen/models/Llava-v1.5 + # /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ + # /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ diff --git a/run_tailor.sh b/run_tailor.sh new file mode 100644 index 0000000..a3ff799 --- /dev/null +++ b/run_tailor.sh @@ -0,0 +1,31 @@ +pretrained_model=(/data/public/opensource_models/meta-llama/Llama-2-7b-hf/ 
/data/public/opensource_models/meta-llama/Llama-2-7b-hf/ /data/public/opensource_models/codellama/codellama-7b-python-hf/ /home/pingbowen/models/vicuna-7b-v1.5) +finetuned_model=(/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ /home/pingbowen/models/llava-v1.5-7b) +finetuned_compressed_model=(/home/pingbowen/workspace/delta-compression/saved_model/WizardMath-7B-V1.0_bitdelta/ /data/groups/QY_LLM_Other/pingbowen/models/wizardmath/WizardMath_svd/ /data/groups/QY_LLM_Other/pingbowen/models/wizardmath/delta_1024_mix_32_8_3_2_full/) +param_types=(q_proj k_proj v_proj o_proj gate_proj up_proj down_proj) +model_types=(svd bitdelta mix) + +for (( j=0; j<${#param_types[@]}; j++ )); do + CUDA_VISIBLE_DEVICES=1 python3 Plot.py --param_type ${param_types[$j]} +done +# for i in {0..2} +# do +# for (( j=0; j<${#param_types[@]}; j++ )); do +# CUDA_VISIBLE_DEVICES=1,7 python tailor.py \ +# --pretrained_model_name ${pretrained_model[0]} \ +# --finetuned_model_name ${finetuned_model[0]} \ +# --finetuned_compressed_model ${finetuned_compressed_model[$i]} \ +# --dim 128 \ +# --scale_factor 1.45 \ +# --param_type ${param_types[$j]} \ +# --model_type ${model_types[$i]} \ +# --save_dir ./statistic/ +# done +# done + + +# & +# /data/public/opensource_models/codellama/codellama-7b-python-hf/ +# /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ +# python3 tailor.py \ +# --finetuned_model_name /data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf \ +# --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/math_lora_7b \ \ No newline at end of file diff --git a/scripts/ppl_eval_example.bash b/scripts/ppl_eval_example.bash index ee6cc6a..ba45351 100644 --- a/scripts/ppl_eval_example.bash +++ b/scripts/ppl_eval_example.bash @@ -1,8 +1,10 @@ +PPL_SAVE_DIR=save + CUDA_VISIBLE_DEVICES=0 python \ bitdelta/eval_ppl.py \ - --base_model meta-llama/Llama-2-7b-hf \ + --base_model /home/pingbowen/workspace/delta-compression/BitDelta/save/calibrated_model \ --dataset_name wikitext \ --subset wikitext-2-raw-v1 \ --save_dir $PPL_SAVE_DIR \ --num_eval_samples 100 \ - --model_diff $MODEL_SAVE_DIR/diff.pt \ \ No newline at end of file + # --model_diff $MODEL_SAVE_DIR/diff.pt \ \ No newline at end of file diff --git a/tailor.py b/tailor.py new file mode 100755 index 0000000..d1f7f88 --- /dev/null +++ b/tailor.py @@ -0,0 +1,93 @@ +import argparse +import jsonlines +import sys +import shutil +import logging +import os +import time +from tqdm import tqdm +import glob +import json +import torch +import datasets +from transformers import AutoTokenizer, AutoModelForCausalLM +# from vllm import LLM, SamplingParams +import re +import random +import numpy as np +import torch.nn.functional as F + +parser = argparse.ArgumentParser() +parser.add_argument('--pretrained_model_name', type=str, help='pretrained model name') +parser.add_argument('--finetuned_model_name', type=str, help='finetuned model name') +parser.add_argument('--finetuned_compressed_model', type=str, help='finetuned model name') +parser.add_argument('--save_dir', type=str, help='finetuned model name') +parser.add_argument('--param_type', type=str, help='finetuned model name') +parser.add_argument('--model_type', type=str, help='finetuned model name') +parser.add_argument('--dim', type=int, help='finetuned model name') +parser.add_argument('--scale_factor', type=float, default=1.45, 
help='finetuned model name') +args = parser.parse_args() + +device = "cuda" if torch.cuda.is_available() else "cpu" + +pretrained_model_name = args.pretrained_model_name + +finetuned_model_name = args.finetuned_model_name +pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name,torch_dtype=torch.bfloat16).to(device) +pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name) + +finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name,torch_dtype=torch.bfloat16).to(device) +finetuned_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=finetuned_model_name) + +finetuned_compressed_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=args.finetuned_compressed_model,torch_dtype=torch.bfloat16).to(device) +def set_random_seed(seed: int = 0): + """ + set random seed + :param seed: int, random seed + :return: + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + +set_random_seed(seed=0) +scale_factor = args.scale_factor + +def decomposition(masked_input_tensor,dim): + + U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) + U , S , V = U[:, :dim],S[:dim],V[:, :dim] + # return torch.mm(U, torch.diag(S)), V.t() + return U @ torch.diag(S) @ V.t() #return lora_B, lora_A + +L2_norm_total,L1_norm_total, mag, num = 0, 0 , 0 ,0 + +l2_norm ,cos_sim = [],[] + +with torch.no_grad(): + for k,v in finetuned_compressed_model.state_dict().items(): + dim = args.dim + if args.param_type in k : # or "mlp" in k + # if "mlp" in k: + # dim = int(dim * scale_factor) + p = pretrained_model.get_submodule(k.replace(".weight", "")).weight + f = finetuned_model.get_submodule(k.replace(".weight", "")).weight + delta , compressed_delta = f - p, v - p + # l2_norm.append(torch.norm(delta - compressed_delta,2).item()) + cos_sim.append(torch.mean(F.cosine_similarity(delta, compressed_delta, dim=0),dim=0).item()) + # L2_norm_total ,L1_norm_total,cos_sim, mag = L2_norm_total + torch.norm(torch.abs(delta) - torch.abs(compressed_delta),2).data, L1_norm_total + torch.norm(torch.abs(delta) - torch.abs(compressed_delta),1).data, cos_sim + F.cosine_similarity(delta, compressed_delta, dim=0), mag + torch.sum(torch.abs(compressed_delta)).data + # num += 1 + + +print("cos_sim:", cos_sim) +torch.save(cos_sim, os.path.join(args.save_dir, f"{args.param_type}_{args.model_type}_cos_sim.pt")) +# print("cos_sim_ave:", cos_sim / num) +# print("mag_ave:", mag_ave) +# finetuned_model.save_pretrained(save_directory=args.save_dir) +# finetuned_tokenizer.save_pretrained(save_directory=args.save_dir) + +print("--end--") diff --git a/test.py b/test.py new file mode 100644 index 0000000..55b59a9 --- /dev/null +++ b/test.py @@ -0,0 +1,170 @@ +import argparse +import transformers +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "7" +import torch +from transformers import AutoConfig, AutoModelForCausalLM +from accelerate import infer_auto_device_map, init_empty_weights +import torch.nn as nn +import torch.nn.functional as F +# from llava.model.language_model.llava_llama import LlavaConfig +from transformers import AutoTokenizer, AutoModelForCausalLM +# from llava.model import * + +def get_tokenizer(tokenizer_name): + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, 
use_fast=False, + ) + + if tokenizer.pad_token_id is None: + if tokenizer.eos_token_id is not None: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.pad_token_id = 0 + + return tokenizer + +@torch.no_grad() +def load_diff(model, diff_dir): + device = model.device + diff_dict = torch.load(diff_dir) + + for name, module in model.named_modules(): + if name + ".mask" in diff_dict: + coeff = diff_dict[name + ".coeff"].to(device) + mask = diff_dict[name + ".mask"].to(device) + + setattr(module, "mask", mask) + setattr(module, "coeff", coeff) + # module.weight.add_((mask * coeff).to(module.weight.dtype)) + elif name + ".weight" in diff_dict: + module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + + elif name + '.A' in diff_dict: + A = diff_dict[name + '.A'].to(device) + B = diff_dict[name + '.B'].to(device) + + mask = (A @ B).T + module.weight.add_(mask.to(module.weight.dtype)) + + model.config.vocab_size = model.lm_head.weight.size(0) + + +def get_model(model_name, device, memory_map=None): + # multi-gpu + if device == "auto" or isinstance(device, list): + + # if gpus are specified, distributes according to the memory map + if isinstance(device, list): + assert memory_map is not None, "memory_map must be specified when using multiple gpus" + config = AutoConfig.from_pretrained(model_name) + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + + device_map = infer_auto_device_map(model, memory_map, no_split_module_classes=["LlamaDecoderLayer"]) + + else: + # use all available gpus + device_map = "auto" + + return transformers.AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + device_map=device_map, + ) + else: # single-gpu or cpu + return transformers.AutoModelForCausalLM.from_pretrained( + model_name, + # torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + ) + + +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device): + base_model = get_model(base_model_name, device) + tokenizer = get_tokenizer(finetuned_model_name) + load_diff(base_model, diff_dir) + + base_model.save_pretrained(save_dir) + tokenizer.save_pretrained(save_dir) + + del base_model + + +device = "cuda" if torch.cuda.is_available() else "cpu" + +a = torch.rand(4096) / 1000 +b = torch.rand(4096) / 1000 + +# a , b = a.to(torch.bfloat16) , b.to(torch.bfloat16) + +dot_fp , dot_pp = torch.dot(a, b) , torch.dot(b, b) + +x = dot_fp / dot_pp + +cosine_sim = F.cosine_similarity(a,b,dim=0) + +cosine_sim2 = F.cosine_similarity(b,a - x * b,dim=0) + +def filter_top_and_bottom_percent(tensor, percent=0.5): + # 计算保留的元素数量 + num_elements = tensor.numel() + num_to_keep = int(num_elements * percent / 100) + + # 展平张量并获取最大和最小的元素的索引 + flat_tensor = tensor.flatten() + _, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True) + _, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False) + + # 创建一个全零张量 + result = torch.zeros_like(tensor) + + # 仅在指定位置放置最大和最小的元素 + result = result.flatten() + result[top_indices] = flat_tensor[top_indices] + result[bottom_indices] = flat_tensor[bottom_indices] + result = result.reshape(tensor.shape) + + return result + +def copy_nonzero_values(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = B[mask] + return A + +def load_svd(model): + param_dict = torch.load(args.svd_dict) + # import pdb; pdb.set_trace() + with torch.no_grad(): + for k,v in param_dict.items(): + if "base" in k: + dim = args.dim + + if "mlp" in k: + dim = int(dim * 
args.scale_factor) + + k = k.replace(".base", "") + + U = param_dict[k + ".U"][:, :dim] + S = param_dict[k + ".S"][:dim] + V = param_dict[k + ".V"][:, :dim] + # import pdb; pdb.set_trace() + model.get_submodule(k).weight.copy_(v + U @ torch.diag(S) @ V.t()) + +parser = argparse.ArgumentParser(description="BitDelta") +parser.add_argument("--dim", type=int, default=128) +parser.add_argument("--scale_factor", type=float, default=1.45) +parser.add_argument("--svd_dict", type=str, default="") +args = parser.parse_args() + +tokenizer = AutoTokenizer.from_pretrained("/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf") +model = AutoModelForCausalLM.from_pretrained("/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf", torch_dtype=torch.bfloat16) # low_cpu_mem_usage=True + +load_svd(model) + +tokenizer.save_pretrained(f"/data/groups/QY_LLM_Other/pingbowen/models/mathlora/math_svd/") +model.save_pretrained(f"/data/groups/QY_LLM_Other/pingbowen/models/mathlora/math_svd/") + +# get_tokenizer("/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/") +# save_full_model("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/", "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/", os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "diff_untrained.pt"), os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "uncalibrated_model"), device="cuda") \ No newline at end of file
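
Note: every compression path in this patch (--choice bit, svd, or mix) ultimately rebuilds an approximate delta and adds it back onto the base weights. The sketch below is a standalone illustration of those ingredients in plain PyTorch (no pack/unpack kernels, no model loading); the helper name, rank, and outlier percentage are illustrative assumptions, not the exact values or the exact mixing scheme used by compress_diff.

import torch

def approx_delta(base_w, finetuned_w, rank=64, outlier_percent=0.02):
    # toy reconstruction: exact outliers + truncated SVD + 1-bit sign/scale residual
    delta = (finetuned_w - base_w).float()

    # keep the largest-magnitude entries exactly (cf. get_outlier / copy_nonzero_values)
    k = int(delta.numel() * outlier_percent / 100)
    flat = delta.flatten()
    keep = torch.zeros_like(flat)
    if k > 0:
        _, top = torch.topk(flat, k, largest=True)
        _, bot = torch.topk(flat, k, largest=False)
        keep[top], keep[bot] = flat[top], flat[bot]
    residual = (flat - keep).reshape(delta.shape)

    # low-rank part of the residual (cf. decomposition(..., dim=rank))
    U, S, V = torch.svd(residual)
    low_rank = U[:, :rank] @ torch.diag(S[:rank]) @ V[:, :rank].t()

    # 1-bit sign * mean-abs scale for what is left (cf. BinaryDiff)
    rest = residual - low_rank
    coeff = rest.abs().mean()
    binary = torch.sign(rest) * coeff

    return keep.reshape(delta.shape) + low_rank + binary

# quick check on random weights: relative error of the reconstructed delta
base = torch.randn(256, 256)
finetuned = base + 0.01 * torch.randn(256, 256)
approx = approx_delta(base, finetuned)
print((torch.norm((finetuned - base) - approx) / torch.norm(finetuned - base)).item())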