From 3a3c7688825736159cab6e554c5b3ab2a6b00ff5 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Mon, 4 Mar 2024 18:48:20 +0800 Subject: [PATCH 01/14] lowrank tailor --- bitdelta/diff.py | 61 ++++++++++- bitdelta/diff.py.rej | 38 +++++++ bitdelta/train.py | 57 ++++++----- bitdelta/train2.py | 37 +++++++ bitdelta/utils.py | 5 +- eval.py | 30 ++++++ lowbit_lowrank.py | 23 +++++ run.sh | 15 +++ scripts/ppl_eval_example.bash | 6 +- tailor.py | 186 ++++++++++++++++++++++++++++++++++ test.py | 110 ++++++++++++++++++++ 11 files changed, 532 insertions(+), 36 deletions(-) create mode 100644 bitdelta/diff.py.rej create mode 100644 bitdelta/train2.py create mode 100644 eval.py create mode 100644 lowbit_lowrank.py create mode 100644 run.sh create mode 100755 tailor.py create mode 100644 test.py diff --git a/bitdelta/diff.py b/bitdelta/diff.py index c2b03ce..faa1bbb 100644 --- a/bitdelta/diff.py +++ b/bitdelta/diff.py @@ -9,12 +9,13 @@ class BinaryDiff(nn.Module): def __init__(self, base, finetune): super().__init__() diff = finetune - base + # diff = decomposition(diff, 2048) quantile = diff.float().abs().mean() mask = torch.ones_like(diff) mask[diff < 0] = 0 mask = pack(mask.bool().T) - + self.register_buffer("mask", mask) self.register_buffer("base", base.T) self.register_parameter( @@ -38,7 +39,15 @@ def forward(self, x): repeated_mask = self.mask.unsqueeze(0).repeat(x.size(0), 1, 1) return x @ self.base + self.coeff * binary_bmm(x, repeated_mask) -def compress_diff(base_model, finetuned_model, finetuned_compressed_model): +def Pass(layers=None,name=None): + if layers is not None: + for layer in layers: + if layer in name: + return True + return False + + +def compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=None): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -59,11 +68,15 @@ def compress_submodule(name, subname, module, submodule): # TODO: this can be parallelized for name, module in finetuned_compressed_model.named_modules(): if "mlp" in name or "self_attn" in name: + + if Pass(layers,name) == True: + continue + for subname, submodule in module.named_children(): if "proj" in subname: compress_submodule(name, subname, module, submodule) -def save_diff(finetuned_compressed_model, save_dir): +def save_diff(finetuned_compressed_model, save_dir,layers=None): diff_dict = {} for name, module in finetuned_compressed_model.named_modules(): @@ -91,6 +104,9 @@ def load_diff(model, diff_dir): # setattr(module, "mask", mask) # setattr(module, "coeff", coeff) weight = (unpack(mask)*2-1) * coeff + + if "mlp" in name: + weight = decomposition(weight, 1024) module.weight.add_(weight.T.to(module.weight.dtype)) elif name + ".weight" in diff_dict: @@ -105,11 +121,46 @@ def load_diff(model, diff_dir): model.config.vocab_size = model.lm_head.weight.size(0) -def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device): +def decomposition(masked_input_tensor,dim): + # if "mlp" in name: + # dim = int(dim * 1.45) + + U , S , V = torch.svd(masked_input_tensor) + # total_sum , partial_sum = torch.sum(S) , torch.sum(S[:128]) + # import pdb; pdb.set_trace() + U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] + return torch.mm(torch.mm(U, torch.diag(S)), V.t()) + +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None): base_model = get_model(base_model_name, device) tokenizer = get_tokenizer(finetuned_model_name) + + finetuned_model = get_model(finetuned_model_name, device) + # params 
= {} + + # for k ,v in finetuned_model.named_parameters(): + # if layers is not None: + # for layer in layers: + # if layer in k: + # if "mlp" in k or "self_attn" in k: + # delta = v.detach().cpu() - base_model.get_submodule(k.replace('.weight',"")).weight.detach().cpu() + # dim = 128 + # if "mlp" in k: + # dim = int(dim * 1.45) + # # import pdb; pdb.set_trace() + # params[k] = decomposition(delta.to(torch.float32), dim).to(torch.bfloat16) + + # import pdb; pdb.set_trace() + # dict(base_model.named_parameters())['model.layers.0.self_attn.o_proj.weight'] + + # with torch.no_grad(): + # for param in params: + # base_model.get_submodule(param.replace('.weight',"")).weight.add_(params[param].detach().to(device)) + + # import pdb; pdb.set_trace() load_diff(base_model, diff_dir) - + + base_model.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir) diff --git a/bitdelta/diff.py.rej b/bitdelta/diff.py.rej new file mode 100644 index 0000000..5f60f5f --- /dev/null +++ b/bitdelta/diff.py.rej @@ -0,0 +1,38 @@ +diff a/bitdelta/diff.py b/bitdelta/diff.py (rejected hunks) +@@ -73,24 +86,31 @@ def save_diff(finetuned_compressed_model, save_dir): + diff_dict[name + ".coeff"] = module.coeff.cpu() + + for name, param in finetuned_compressed_model.named_parameters(): ++ if "mlp" in name or "self_attn" in name: ++ if Pass(layers,name) == True: ++ continue ++ + if param.requires_grad: + diff_dict[name] = param.cpu() +- ++ ++ # import pdb; pdb.set_trace() + torch.save(diff_dict, save_dir) + + @torch.no_grad() + def load_diff(model, diff_dir): + device = model.device + diff_dict = torch.load(diff_dir) +- ++ + for name, module in model.named_modules(): + if name + ".mask" in diff_dict: + coeff = diff_dict[name + ".coeff"].to(device) + mask = diff_dict[name + ".mask"].to(device) + +- setattr(module, "mask", mask) +- setattr(module, "coeff", coeff) +- # module.weight.add_((mask * coeff).to(module.weight.dtype)) ++ # setattr(module, "mask", mask) ++ # setattr(module, "coeff", coeff) ++ weight = (unpack(mask)*2-1) * coeff ++ ++ module.weight.add_(weight.T.to(module.weight.dtype)) + elif name + ".weight" in diff_dict: + module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + diff --git a/bitdelta/train.py b/bitdelta/train.py index 946dafb..6ab3825 100644 --- a/bitdelta/train.py +++ b/bitdelta/train.py @@ -37,7 +37,7 @@ finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) print(f"compressing diff...") -compress_diff(base_model, finetuned_model, finetuned_compressed_model) +compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=args.layers) train_num_samples = args.batch_size * args.num_steps train_dataset = get_dataset( @@ -55,37 +55,38 @@ ) # save untrained delta -save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt")) +save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt"),layers=args.layers) -optimizer = torch.optim.AdamW(finetuned_compressed_model.parameters(), lr=args.lr) -scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_steps) +if args.train: + optimizer = torch.optim.AdamW(finetuned_compressed_model.parameters(), lr=args.lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_steps) -bar = tqdm(train_dataloader) + bar = tqdm(train_dataloader) -train_loss_list = [] + train_loss_list = [] -# Train loop -for step, batch in enumerate(bar): - 
batch1 = {k: v.to(finetuned_model.device) for k, v in batch.items()} - with torch.inference_mode(): - finetuned_outputs = finetuned_model(**batch1) + # Train loop + for step, batch in enumerate(bar): + batch1 = {k: v.to(finetuned_model.device) for k, v in batch.items()} + with torch.inference_mode(): + finetuned_outputs = finetuned_model(**batch1) - batch2 = {k: v.to(finetuned_compressed_model.device) for k, v in batch.items()} - finetuned_compressed_outputs = finetuned_compressed_model(**batch2) + batch2 = {k: v.to(finetuned_compressed_model.device) for k, v in batch.items()} + finetuned_compressed_outputs = finetuned_compressed_model(**batch2) - loss = F.mse_loss( - finetuned_outputs.logits.clone().to(finetuned_compressed_outputs.logits.device), - finetuned_compressed_outputs.logits, - ) + loss = F.mse_loss( + finetuned_outputs.logits.clone().to(finetuned_compressed_outputs.logits.device), + finetuned_compressed_outputs.logits, + ) - train_loss_list.append(loss.item()) + train_loss_list.append(loss.item()) - optimizer.zero_grad() - loss.backward() - optimizer.step() - scheduler.step() + optimizer.zero_grad() + loss.backward() + optimizer.step() + scheduler.step() - bar.set_description(f"train loss: {loss.item()}") + bar.set_description(f"train loss: {loss.item()}") # save loss list @@ -93,14 +94,14 @@ with open(os.path.join(args.save_dir, f"train_loss_{args.num_groups}.json"), "w") as f: json.dump(train_loss_list, f) -# save trained delta -save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff.pt")) +# # save trained delta +save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff.pt"),layers=args.layers) del base_model, finetuned_model, finetuned_compressed_model torch.cuda.empty_cache() if args.save_full_model: print("saving uncalibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, "uncalibrated_model"), device="cpu") - print("saving calibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") + save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, f"uncalibrated_model"), device="cpu",layers=args.layers) + # print("saving calibrated model") + # save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") diff --git a/bitdelta/train2.py b/bitdelta/train2.py new file mode 100644 index 0000000..37c9c70 --- /dev/null +++ b/bitdelta/train2.py @@ -0,0 +1,37 @@ +import os + +import torch + +import torch.nn.functional as F +from bitdelta.diff import compress_diff, save_diff, save_full_model +from bitdelta.misc import find_corr_stddev + +from bitdelta.utils import get_model, parse_args, get_tokenizer +from tqdm import tqdm +from bitdelta.data import get_dataset, get_dataloader + +import json + +args = parse_args() + +# create save_dir if it doesn't exist +os.makedirs(args.save_dir, exist_ok=True) + +tokenizer = get_tokenizer(args.base_model) + +with torch.no_grad(): + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + +finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, 
args.finetuned_compressed_model_memory_map) + +print(f"compressing diff...") +compress_diff(base_model, finetuned_model, finetuned_compressed_model) + +# save untrained delta +save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt")) + + +if args.save_full_model: + print("saving uncalibrated model") + save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, "uncalibrated_model"), device="cpu") diff --git a/bitdelta/utils.py b/bitdelta/utils.py index a7c55ea..1304239 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -21,9 +21,11 @@ def parse_args(): parser.add_argument("--lr", type=float, default=1e-4) parser.add_argument("--num_steps", type=int, default=100) parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument("--layers", nargs='+', default=None) + parser.add_argument("--save_num", type=int, default=0) parser.add_argument("--max_length", type=int, default=128) parser.add_argument("--save_dir", type=str, required=True) - + parser.add_argument("--train", action="store_true") # device management parser.add_argument("--base_model_device", type=str, default="0") @@ -102,6 +104,7 @@ def get_model(model_name, device, memory_map=None): else: # single-gpu or cpu return transformers.AutoModelForCausalLM.from_pretrained( model_name, + # torch_dtype=torch.float16, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, ).to(device) diff --git a/eval.py b/eval.py new file mode 100644 index 0000000..5e813b1 --- /dev/null +++ b/eval.py @@ -0,0 +1,30 @@ +import argparse +import transformers +import torch +from transformers import AutoConfig, AutoModelForCausalLM + +def load_model(model_name): + model = transformers.AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True,) + return model + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--base_model', type=str) + parser.add_argument('--finetuned_model', type=str) + args = parser.parse_args() + + base_model = load_model(args.base_model) + finetuned_model = load_model(args.finetuned_model) + + params = dict() + + for n,p in finetuned_model.named_parameters(): + if "mlp" in n or "self_attn" in n: + delta = p - base_model.state_dict()[n] + w = torch.sum(torch.abs(delta)) + params[n] = w.item() + + print(params) \ No newline at end of file diff --git a/lowbit_lowrank.py b/lowbit_lowrank.py new file mode 100644 index 0000000..4f159ff --- /dev/null +++ b/lowbit_lowrank.py @@ -0,0 +1,23 @@ +import os + +import torch + +import torch.nn.functional as F +from bitdelta.diff import compress_diff, save_diff, save_full_model +from bitdelta.misc import find_corr_stddev + +from bitdelta.utils import get_model, parse_args, get_tokenizer +from tqdm import tqdm + +args = parse_args() + +tokenizer = get_tokenizer(args.base_model) + +with torch.no_grad(): + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + +finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) + +print(f"compressing diff...") +compress_diff(base_model, finetuned_model, finetuned_compressed_model) diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..ef3b529 --- /dev/null +++ b/run.sh @@ -0,0 +1,15 @@ +MODEL_SAVE_DIR=save/ + +mkdir -p 
$MODEL_SAVE_DIR + +CUDA_VISIBLE_DEVICES=6,7 python \ + bitdelta/train.py \ + --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ + --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ + --save_dir $MODEL_SAVE_DIR \ + --batch_size 4 \ + --num_steps 200 \ + --save_full_model True + + # --layers "layers.5."\ + # /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ diff --git a/scripts/ppl_eval_example.bash b/scripts/ppl_eval_example.bash index ee6cc6a..ba45351 100644 --- a/scripts/ppl_eval_example.bash +++ b/scripts/ppl_eval_example.bash @@ -1,8 +1,10 @@ +PPL_SAVE_DIR=save + CUDA_VISIBLE_DEVICES=0 python \ bitdelta/eval_ppl.py \ - --base_model meta-llama/Llama-2-7b-hf \ + --base_model /home/pingbowen/workspace/delta-compression/BitDelta/save/calibrated_model \ --dataset_name wikitext \ --subset wikitext-2-raw-v1 \ --save_dir $PPL_SAVE_DIR \ --num_eval_samples 100 \ - --model_diff $MODEL_SAVE_DIR/diff.pt \ \ No newline at end of file + # --model_diff $MODEL_SAVE_DIR/diff.pt \ \ No newline at end of file diff --git a/tailor.py b/tailor.py new file mode 100755 index 0000000..cf3143e --- /dev/null +++ b/tailor.py @@ -0,0 +1,186 @@ +import argparse +import jsonlines +import sys +import shutil +import logging +import os +import time +from tqdm import tqdm +import glob +import json +import torch +import datasets +from transformers import AutoTokenizer, AutoModelForCausalLM +# from vllm import LLM, SamplingParams +import re +import random +import numpy as np + +pretrained_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" + +finetuned_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf" # /data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf + +pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, + device_map="cpu") +pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name) +finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name, + device_map="cpu") +finetuned_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=finetuned_model_name) + +save_dir = "/home/pingbowen/workspace/delta-compression/BitDelta/save/uncalibrated_model" + +def set_random_seed(seed: int = 0): + """ + set random seed + :param seed: int, random seed + :return: + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + +set_random_seed(seed=0) +# scale_factor = finetuned_model.config.intermediate_size / finetuned_model.config.hidden_size + + +def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list): + """ + get the names of parameters that need to be merged + :param input_param_names: list, names of input parameters + :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded + :return: + """ + param_names_to_merge = [] + for param_name in input_param_names: + exclude = any([re.match(exclude_pattern, param_name) for exclude_pattern in exclude_param_names_regex]) + if not exclude: + param_names_to_merge.append(param_name) + return param_names_to_merge + + + +task_vector_param_dict = {} +pretrained_param_dict = {param_name: param_value for param_name, param_value in 
pretrained_model.named_parameters()} +finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} +# param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) +# with torch.no_grad(): +# for param_name in finetuned_param_dict.keys(): +# task_vector_param_dict[param_name] = finetuned_param_dict[param_name] - pretrained_param_dict[param_name] +# print(f"name {param_name} data {task_vector_param_dict[param_name]} ") + + +# import pdb +# pdb.set_trace() + +def decomposition(masked_input_tensor,dim): + + U , S , V = torch.svd(masked_input_tensor) + U , S , V = U[:, :dim],S[:dim],V[:, :dim] + # return torch.mm(U, torch.diag(S)), V.t() + # return U, torch.mm(torch.diag(S), V.t()) #return lora_B, lora_A + return torch.mm(torch.mm(U, torch.diag(S)), V.t()) + +# dim = 256 +dim = 128 +# dim = 16 +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) + +peft_dict = {} +malign_dict = {} +other_dict = {} + +# finetuned_param_dict +# for param_name, param_value in tqdm(pretrained_param_dict.items()): +# if "self_attn" in param_name or "mlp" in param_name: +# pass +# else: +# other_dict[param_name] = param_value.contiguous() + +diff = dict() + +for param_name, param_value in tqdm(finetuned_param_dict.items()): + if "self_attn" in param_name or "mlp" in param_name: + delta = param_value - pretrained_param_dict[param_name] + if "mlp" in param_name: + dim = int(dim * 1.45) + delta = decomposition(delta,dim=dim) + diff[param_name] = (pretrained_param_dict[param_name] + delta).contiguous() + else: + diff[param_name] = param_value.contiguous() + # lora_A = lora_A * (dim/16) ###补偿scaling, 以后的alpha可以统一为16 + # peft_key = "base_model.model." 
+ param_name.split(".weight")[0] + # print(peft_key+".lora_A.weight") + # peft_dict[peft_key+".lora_A.weight"] = lora_A.contiguous() + # peft_dict[peft_key+".lora_B.weight"] = lora_B.contiguous() + +for n,p in pretrained_model.named_parameters(): + p.data.copy_(diff[n]) + +pretrained_model.save_pretrained(save_dir) +finetuned_tokenizer.save_pretrained(save_dir) + +# other_dict = {k: v.to(torch.float16) for k, v in other_dict.items()} + +# other_para_path = "/home/wanghanqing/projects/exp/mAlign_exp/lang_LoRAs/peft_ver/trim_lora/code/other_param" +# torch.save(other_dict, os.path.join(other_para_path, "other.pt")) +# torch.save(other_dict, os.path.join(other_para_path, "pretrain_other.pt")) + + +# peft_dict = {k: v.to(torch.float16) for k, v in peft_dict.items()} + +# layernum = 40 +# for lnum in range(layernum): +# peft_pfx = f"base_model.model.model.layers.{lnum}" +# delta_pfx = f"encoder.layers.{lnum}" +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_q_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.q_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_q_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.q_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_k_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.k_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_k_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.k_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_v_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.v_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_v_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.v_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.attention_out_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.o_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.attention_out_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.o_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_0_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.gate_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_0_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.gate_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_1_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.up_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_1_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.up_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_out_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.down_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_out_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.down_proj.lora_B.weight"].contiguous() + + + + + +# malign_dict = {k: v.to(torch.float16) for k, v in malign_dict.items()} + +# import pdb +# pdb.set_trace() + +output_peft_path = "/home/wanghanqing/projects/exp/mAlign_exp/lang_LoRAs/peft_ver/trim_lora/dim256_2/code" +output_malign_path = "/home/wanghanqing/projects/exp/mAlign_exp/mAlign_LoRAs/trim_lora/dim256_2/code" + +# torch.save(peft_dict, os.path.join(output_peft_path, "adapter_model.bin")) +# torch.save(malign_dict, os.path.join(output_malign_path, "lora.pt")) + + +print("--end--") + + +# for param_name, param_value in finetuned_model.named_parameters(): +# if 
param_name in masked_param_dict: +# param_value.data.copy_(masked_param_dict[param_name]) + +# logger.info(f"saving model at {save_model_path}...") +# os.makedirs(save_model_path, exist_ok=True) +# finetuned_model.save_pretrained(save_directory=save_model_path) +# finetuned_tokenizer.save_pretrained(save_directory=save_model_path) +# logger.info(f"model is saved") \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..efdb2bf --- /dev/null +++ b/test.py @@ -0,0 +1,110 @@ +import argparse +import transformers +import torch +from transformers import AutoConfig, AutoModelForCausalLM +from accelerate import infer_auto_device_map, init_empty_weights +import torch.nn as nn +import os +from llava.model.language_model.llava_llama import LlavaConfig +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava.model import * + +def get_tokenizer(tokenizer_name): + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, use_fast=False, + ) + + if tokenizer.pad_token_id is None: + if tokenizer.eos_token_id is not None: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.pad_token_id = 0 + + return tokenizer + +@torch.no_grad() +def load_diff(model, diff_dir): + device = model.device + diff_dict = torch.load(diff_dir) + + for name, module in model.named_modules(): + if name + ".mask" in diff_dict: + coeff = diff_dict[name + ".coeff"].to(device) + mask = diff_dict[name + ".mask"].to(device) + + setattr(module, "mask", mask) + setattr(module, "coeff", coeff) + # module.weight.add_((mask * coeff).to(module.weight.dtype)) + elif name + ".weight" in diff_dict: + module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + + elif name + '.A' in diff_dict: + A = diff_dict[name + '.A'].to(device) + B = diff_dict[name + '.B'].to(device) + + mask = (A @ B).T + module.weight.add_(mask.to(module.weight.dtype)) + + model.config.vocab_size = model.lm_head.weight.size(0) + + +def get_model(model_name, device, memory_map=None): + # multi-gpu + if device == "auto" or isinstance(device, list): + + # if gpus are specified, distributes according to the memory map + if isinstance(device, list): + assert memory_map is not None, "memory_map must be specified when using multiple gpus" + config = AutoConfig.from_pretrained(model_name) + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + + device_map = infer_auto_device_map(model, memory_map, no_split_module_classes=["LlamaDecoderLayer"]) + + else: + # use all available gpus + device_map = "auto" + + return transformers.AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + device_map=device_map, + ) + else: # single-gpu or cpu + return transformers.AutoModelForCausalLM.from_pretrained( + model_name, + # torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + ) + + +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device): + base_model = get_model(base_model_name, device) + tokenizer = get_tokenizer(finetuned_model_name) + load_diff(base_model, diff_dir) + + base_model.save_pretrained(save_dir) + tokenizer.save_pretrained(save_dir) + + del base_model + +model_path = "/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/" + +lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path) +tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) +print('Loading LLaVA from base model...') +model = LlavaLlamaForCausalLM.from_pretrained(model_base, 
low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) +token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features +if model.lm_head.weight.shape[0] != token_num: + model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + + + +# base_model = get_model("/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/", "cuda") +# params = base_model.state_dict() + +# print(params.keys()) + +# get_tokenizer("/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/") +# save_full_model("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/", "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/", os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "diff_untrained.pt"), os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "uncalibrated_model"), device="cuda") \ No newline at end of file From 86d443d17d33a221a413aade0e5f1cafa0d77e0b Mon Sep 17 00:00:00 2001 From: pingbowen Date: Tue, 12 Mar 2024 15:07:43 +0800 Subject: [PATCH 02/14] finish fp16 + 1bit, check diff2.py train2.py --- bitdelta/diff.py | 62 +++++++++------- bitdelta/diff2.py | 173 +++++++++++++++++++++++++++++++++++++++++++++ bitdelta/train.py | 11 ++- bitdelta/train2.py | 14 ++-- run.sh | 4 +- tailor.py | 108 ++++++++++++++-------------- test.py | 25 ++++--- 7 files changed, 296 insertions(+), 101 deletions(-) create mode 100644 bitdelta/diff2.py diff --git a/bitdelta/diff.py b/bitdelta/diff.py index faa1bbb..2f97ab7 100644 --- a/bitdelta/diff.py +++ b/bitdelta/diff.py @@ -9,7 +9,7 @@ class BinaryDiff(nn.Module): def __init__(self, base, finetune): super().__init__() diff = finetune - base - # diff = decomposition(diff, 2048) + diff = decomposition(diff, st=64, ed=1024) quantile = diff.float().abs().mean() mask = torch.ones_like(diff) @@ -66,17 +66,28 @@ def compress_submodule(name, subname, module, submodule): setattr(module, subname, compressed) # TODO: this can be parallelized + # flag = False for name, module in finetuned_compressed_model.named_modules(): - if "mlp" in name or "self_attn" in name: - - if Pass(layers,name) == True: - continue - + # if flag == True: + # break + + if "self_attn" in name: for subname, submodule in module.named_children(): if "proj" in subname: compress_submodule(name, subname, module, submodule) - -def save_diff(finetuned_compressed_model, save_dir,layers=None): + elif "mlp" in name: + with torch.no_grad(): + for subname, submodule in module.named_children(): + if "proj" in subname: + base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + delta = decomposition(finetuned_weight - base_weight,dim=int(128 * 1.45)) + finetuned_compressed_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) + # flag = True + # import pdb; pdb.set_trace() + # break + +def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): diff_dict = {} for name, module in finetuned_compressed_model.named_modules(): @@ -92,9 +103,10 @@ def save_diff(finetuned_compressed_model, save_dir,layers=None): torch.save(diff_dict, save_dir) @torch.no_grad() -def load_diff(model, diff_dir): +def load_diff(model, diff_dir,ori_diff): device = model.device diff_dict = 
torch.load(diff_dir) + # ori_diff = torch.load(ori_diff) for name, module in model.named_modules(): if name + ".mask" in diff_dict: @@ -104,13 +116,15 @@ def load_diff(model, diff_dir): # setattr(module, "mask", mask) # setattr(module, "coeff", coeff) weight = (unpack(mask)*2-1) * coeff - - if "mlp" in name: - weight = decomposition(weight, 1024) + weight_fp16 = decomposition(ori_diff[name + ".weight"].to(torch.float32), dim=64).to(torch.bfloat16) + # import pdb; pdb.set_trace() - module.weight.add_(weight.T.to(module.weight.dtype)) + module.weight.add_(weight_fp16.to(module.weight.dtype) + weight.T.to(module.weight.dtype)) elif name + ".weight" in diff_dict: module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + + # if "mlp" in name: + # import pdb; pdb.set_trace() elif name + '.A' in diff_dict: A = diff_dict[name + '.A'].to(device) @@ -121,17 +135,18 @@ def load_diff(model, diff_dir): model.config.vocab_size = model.lm_head.weight.size(0) -def decomposition(masked_input_tensor,dim): - # if "mlp" in name: - # dim = int(dim * 1.45) +def decomposition(masked_input_tensor,dim=None,st=None,ed=None): + U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) + + if dim is not None: + U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] + + if st is not None and ed is not None: + U , S , V = U[:, st:ed],S[st:ed] ,V[:, st:ed] - U , S , V = torch.svd(masked_input_tensor) - # total_sum , partial_sum = torch.sum(S) , torch.sum(S[:128]) - # import pdb; pdb.set_trace() - U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] return torch.mm(torch.mm(U, torch.diag(S)), V.t()) -def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None): +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): base_model = get_model(base_model_name, device) tokenizer = get_tokenizer(finetuned_model_name) @@ -150,17 +165,14 @@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d # # import pdb; pdb.set_trace() # params[k] = decomposition(delta.to(torch.float32), dim).to(torch.bfloat16) - # import pdb; pdb.set_trace() # dict(base_model.named_parameters())['model.layers.0.self_attn.o_proj.weight'] # with torch.no_grad(): # for param in params: # base_model.get_submodule(param.replace('.weight',"")).weight.add_(params[param].detach().to(device)) - # import pdb; pdb.set_trace() - load_diff(base_model, diff_dir) + load_diff(base_model, diff_dir,ori_diff=ori_diff) - base_model.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py new file mode 100644 index 0000000..30c986c --- /dev/null +++ b/bitdelta/diff2.py @@ -0,0 +1,173 @@ +import torch +import torch.nn as nn +import gc + +from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm +from bitdelta.utils import get_model, get_tokenizer + +class BinaryDiff(nn.Module): + def __init__(self, weight): + super().__init__() + diff = weight + quantile = diff.float().abs().mean() + + mask = torch.ones_like(diff) + mask[diff < 0] = 0 + mask = pack(mask.bool().T) + + self.register_buffer("mask", mask) + # self.register_buffer("base", base.T) + self.register_parameter( + "coeff", + nn.Parameter( + torch.tensor( + quantile, + dtype=torch.float32, + requires_grad=True, + device=weight.device, + ) + ), + ) + # del base, finetune, diff + + def forward(self, x): + # print(x.shape, self.base.shape, self.coeff.shape, self.mask.shape) + # [B, seq, in] @ [in, out] + [B, seq, in] @ [B, 
in/32, out] + + # TODO: This can be faster + repeated_mask = self.mask.unsqueeze(0).repeat(x.size(0), 1, 1) + return x @ self.base + self.coeff * binary_bmm(x, repeated_mask) + +def Pass(layers=None,name=None): + if layers is not None: + for layer in layers: + if layer in name: + return True + return False + + +def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): + def compress_submodule(name, subname, module, submodule): + target_device = submodule.weight.device + + base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(target_device) + finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(target_device) + + compressed = BinaryDiff( + base=base_weight, + finetune=finetuned_weight, + ).to(target_device) + + del submodule, base_weight + setattr(module, subname, None) + gc.collect() + torch.cuda.empty_cache() + setattr(module, subname, compressed) + + # TODO: this can be parallelized + for name, module in finetuned_compressed_model.named_modules(): + + if "self_attn" in name: + for subname, submodule in module.named_children(): + if "proj" in subname: + base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + # compress_submodule(name, subname, module, submodule) + U,S,V = decomposition(finetuned_weight - base_weight,dim=1024) + + compressed_U, compressed_V = BinaryDiff(weight=U[:,64:]).to(finetuned_weight.device), BinaryDiff(weight=V[:,64:]).to(finetuned_weight.device) + U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff + weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff + # import pdb; pdb.set_trace() + U[:,64:] , V[:,64:] = weight_U.T, weight_V.T # 不确定是否有bug + delta = U @ torch.diag(S) @ V.t() + with torch.no_grad(): + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) + + + elif "mlp" in name: + with torch.no_grad(): + for subname, submodule in module.named_children(): + if "proj" in subname: + base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + U,S,V = decomposition(finetuned_weight - base_weight,dim=int(128 * 1.45)) + delta = torch.mm(torch.mm(U, torch.diag(S)), V.t()) + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) + + + finetuned_model.save_pretrained(save_dir) + +def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): + diff_dict = {} + + for name, module in finetuned_compressed_model.named_modules(): + if isinstance(module, BinaryDiff): + # diff_dict[name + ".mask"] = (module.mask == 1).bool().cpu() + diff_dict[name + ".mask"] = module.mask.cpu() + diff_dict[name + ".coeff"] = module.coeff.cpu() + + for name, param in finetuned_compressed_model.named_parameters(): + if param.requires_grad: + diff_dict[name] = param.cpu() + + torch.save(diff_dict, save_dir) + +@torch.no_grad() +def load_diff(model, diff_dir,ori_diff): + device = model.device + diff_dict = torch.load(diff_dir) + # ori_diff = torch.load(ori_diff) + + for name, module in model.named_modules(): + if name + ".mask" in diff_dict: + coeff = diff_dict[name + 
".coeff"].to(device) + mask = diff_dict[name + ".mask"].to(device) + + # setattr(module, "mask", mask) + # setattr(module, "coeff", coeff) + weight = (unpack(mask)*2-1) * coeff + weight_fp16 = decomposition(ori_diff[name + ".weight"].to(torch.float32), dim=64).to(torch.bfloat16) + # import pdb; pdb.set_trace() + + module.weight.add_(weight_fp16.to(module.weight.dtype) + weight.T.to(module.weight.dtype)) + elif name + ".weight" in diff_dict: + module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + + # if "mlp" in name: + # import pdb; pdb.set_trace() + + elif name + '.A' in diff_dict: + A = diff_dict[name + '.A'].to(device) + B = diff_dict[name + '.B'].to(device) + + mask = (A @ B).T + module.weight.add_(mask.to(module.weight.dtype)) + + model.config.vocab_size = model.lm_head.weight.size(0) + +def decomposition(masked_input_tensor,dim=None,st=None,ed=None,name=None): + U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) + + if dim is not None: + U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] + + if st is not None and ed is not None: + U , S , V = U[:, st:ed],S[st:ed] ,V[:, st:ed] + + return U, S, V + +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): + base_model = get_model(base_model_name, device) + tokenizer = get_tokenizer(finetuned_model_name) + + finetuned_model = get_model(finetuned_model_name, device) + # params = {} + + load_diff(base_model, diff_dir,ori_diff=ori_diff) + + base_model.save_pretrained(save_dir) + tokenizer.save_pretrained(save_dir) + + del base_model + diff --git a/bitdelta/train.py b/bitdelta/train.py index 6ab3825..9e4bf97 100644 --- a/bitdelta/train.py +++ b/bitdelta/train.py @@ -23,6 +23,13 @@ base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) +def original_diff(base_model, finetuned_model): + origin_diff = {} + for k, v in finetuned_model.named_parameters(): + if "mlp" in k or "self_attn" in k: + origin_diff[k] = v.detach().cpu() - base_model.get_submodule(k.replace('.weight',"")).weight.detach().cpu() + return origin_diff + # get corr/stddev stats if args.debug: print(f"finding corr/stddev stats...") @@ -94,6 +101,8 @@ with open(os.path.join(args.save_dir, f"train_loss_{args.num_groups}.json"), "w") as f: json.dump(train_loss_list, f) +ori_diff = original_diff(base_model, finetuned_model) + # # save trained delta save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff.pt"),layers=args.layers) @@ -102,6 +111,6 @@ if args.save_full_model: print("saving uncalibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, f"uncalibrated_model"), device="cpu",layers=args.layers) + save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, f"uncalibrated_model"), device="cpu",layers=args.layers,ori_diff=ori_diff) # print("saving calibrated model") # save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") diff --git a/bitdelta/train2.py b/bitdelta/train2.py index 37c9c70..eb9d66d 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -3,7 +3,7 @@ import torch import torch.nn.functional as F -from bitdelta.diff import compress_diff, 
save_diff, save_full_model +from bitdelta.diff2 import compress_diff, save_diff, save_full_model from bitdelta.misc import find_corr_stddev from bitdelta.utils import get_model, parse_args, get_tokenizer @@ -17,7 +17,7 @@ # create save_dir if it doesn't exist os.makedirs(args.save_dir, exist_ok=True) -tokenizer = get_tokenizer(args.base_model) +tokenizer = get_tokenizer(args.finetuned_model) with torch.no_grad(): base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) @@ -26,12 +26,6 @@ finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) print(f"compressing diff...") -compress_diff(base_model, finetuned_model, finetuned_compressed_model) +compress_diff(base_model, finetuned_model, finetuned_compressed_model,args.save_dir) -# save untrained delta -save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt")) - - -if args.save_full_model: - print("saving uncalibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, "uncalibrated_model"), device="cpu") +tokenizer.save_pretrained(args.save_dir) diff --git a/run.sh b/run.sh index ef3b529..8874caf 100644 --- a/run.sh +++ b/run.sh @@ -1,9 +1,9 @@ -MODEL_SAVE_DIR=save/ +MODEL_SAVE_DIR=save/uncalibrated_model_0 mkdir -p $MODEL_SAVE_DIR CUDA_VISIBLE_DEVICES=6,7 python \ - bitdelta/train.py \ + bitdelta/train2.py \ --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ diff --git a/tailor.py b/tailor.py index cf3143e..270bdaa 100755 --- a/tailor.py +++ b/tailor.py @@ -15,20 +15,22 @@ import re import random import numpy as np +import math -pretrained_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" +parser = argparse.ArgumentParser() +parser.add_argument('--finetuned_model_name', type=str, required=True, help='finetuned model name') +parser.add_argument('--save_dir', type=str, required=True, help='finetuned model name') +args = parser.parse_args() -finetuned_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf" # /data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf +pretrained_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" -pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, +finetuned_model_name = args.finetuned_model_name # /data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf +pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, device_map="cpu") pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name) -finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name, +finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name, device_map="cpu") finetuned_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=finetuned_model_name) - -save_dir = "/home/pingbowen/workspace/delta-compression/BitDelta/save/uncalibrated_model" - def set_random_seed(seed: int = 0): """ set random seed @@ -47,6 +49,7 @@ def set_random_seed(seed: int = 0): # scale_factor = finetuned_model.config.intermediate_size / 
finetuned_model.config.hidden_size +scale_factor = 1.45 def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list): """ get the names of parameters that need to be merged @@ -62,17 +65,6 @@ def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: return param_names_to_merge - -task_vector_param_dict = {} -pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()} -finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} -# param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) -# with torch.no_grad(): -# for param_name in finetuned_param_dict.keys(): -# task_vector_param_dict[param_name] = finetuned_param_dict[param_name] - pretrained_param_dict[param_name] -# print(f"name {param_name} data {task_vector_param_dict[param_name]} ") - - # import pdb # pdb.set_trace() @@ -81,12 +73,11 @@ def decomposition(masked_input_tensor,dim): U , S , V = torch.svd(masked_input_tensor) U , S , V = U[:, :dim],S[:dim],V[:, :dim] # return torch.mm(U, torch.diag(S)), V.t() - # return U, torch.mm(torch.diag(S), V.t()) #return lora_B, lora_A - return torch.mm(torch.mm(U, torch.diag(S)), V.t()) + return torch.mm(U, torch.mm(torch.diag(S), V.t())) #return lora_B, lora_A -# dim = 256 +# dim = 1024 dim = 128 -# dim = 16 +# dim = 64 print("----------------------dim: ",dim) print("----------------------dim: ",dim) print("----------------------dim: ",dim) @@ -98,35 +89,34 @@ def decomposition(masked_input_tensor,dim): malign_dict = {} other_dict = {} -# finetuned_param_dict -# for param_name, param_value in tqdm(pretrained_param_dict.items()): +task_vector_param_dict = {} +pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()} +finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} +param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) +with torch.no_grad(): + for param_name in param_names_to_merge: + if "self_attn" in param_name or "mlp" in param_name: + # import pdb ;pdb.set_trace() + if "mlp" in param_name: + dim = math.ceil(dim * scale_factor) + + delta = decomposition(finetuned_param_dict[param_name] - pretrained_param_dict[param_name],dim=dim) + finetuned_model.get_submodule(param_name.replace(".weight", "")).weight.copy_(pretrained_model.get_submodule(param_name.replace(".weight", "")).weight + delta) + # print(f"name {param_name} data {task_vector_param_dict[param_name]} ") + + +finetuned_model.save_pretrained(save_directory=args.save_dir) +finetuned_tokenizer.save_pretrained(save_directory=args.save_dir) + +# for param_name, param_value in tqdm(task_vector_param_dict.items()): # if "self_attn" in param_name or "mlp" in param_name: -# pass -# else: -# other_dict[param_name] = param_value.contiguous() - -diff = dict() - -for param_name, param_value in tqdm(finetuned_param_dict.items()): - if "self_attn" in param_name or "mlp" in param_name: - delta = param_value - pretrained_param_dict[param_name] - if "mlp" in param_name: - dim = int(dim * 1.45) - delta = decomposition(delta,dim=dim) - diff[param_name] = (pretrained_param_dict[param_name] + delta).contiguous() - else: - diff[param_name] = param_value.contiguous() - # lora_A = lora_A * (dim/16) ###补偿scaling, 以后的alpha可以统一为16 - 
# peft_key = "base_model.model." + param_name.split(".weight")[0] - # print(peft_key+".lora_A.weight") - # peft_dict[peft_key+".lora_A.weight"] = lora_A.contiguous() - # peft_dict[peft_key+".lora_B.weight"] = lora_B.contiguous() - -for n,p in pretrained_model.named_parameters(): - p.data.copy_(diff[n]) - -pretrained_model.save_pretrained(save_dir) -finetuned_tokenizer.save_pretrained(save_dir) +# lora_B, lora_A = decomposition(param_value,dim=dim) +# lora_A = lora_A * (dim/16) ###补偿scaling, 以后的alpha可以统一为16 +# peft_key = "base_model.model." + param_name.split(".weight")[0] +# print(peft_key+".lora_A.weight") +# peft_dict[peft_key+".lora_A.weight"] = lora_A.contiguous() +# peft_dict[peft_key+".lora_B.weight"] = lora_B.contiguous() + # other_dict = {k: v.to(torch.float16) for k, v in other_dict.items()} @@ -135,7 +125,7 @@ def decomposition(masked_input_tensor,dim): # torch.save(other_dict, os.path.join(other_para_path, "pretrain_other.pt")) -# peft_dict = {k: v.to(torch.float16) for k, v in peft_dict.items()} +peft_dict = {k: v.to(torch.float16) for k, v in peft_dict.items()} # layernum = 40 # for lnum in range(layernum): @@ -160,7 +150,7 @@ def decomposition(masked_input_tensor,dim): -# malign_dict = {k: v.to(torch.float16) for k, v in malign_dict.items()} +malign_dict = {k: v.to(torch.float16) for k, v in malign_dict.items()} # import pdb # pdb.set_trace() @@ -175,6 +165,20 @@ def decomposition(masked_input_tensor,dim): print("--end--") + + + +# num , masked_input_tensor = 0,input_tensor +# if "self_attn" in param_name or "mlp" in param_name: +# if "mlp" in param_name: +# dim = math.ceil(dim * scale_factor) +# thresh_hold = 0.06752 +# num, masked_input_tensor = decomposition(input_tensor,dim=dim) + + + + + # for param_name, param_value in finetuned_model.named_parameters(): # if param_name in masked_param_dict: # param_value.data.copy_(masked_param_dict[param_name]) diff --git a/test.py b/test.py index efdb2bf..1d8fe23 100644 --- a/test.py +++ b/test.py @@ -5,9 +5,9 @@ from accelerate import infer_auto_device_map, init_empty_weights import torch.nn as nn import os -from llava.model.language_model.llava_llama import LlavaConfig +# from llava.model.language_model.llava_llama import LlavaConfig from transformers import AutoTokenizer, AutoModelForCausalLM -from llava.model import * +# from llava.model import * def get_tokenizer(tokenizer_name): tokenizer = transformers.AutoTokenizer.from_pretrained( @@ -88,18 +88,21 @@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d del base_model -model_path = "/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/" -lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path) -tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) -print('Loading LLaVA from base model...') -model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) -token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features -if model.lm_head.weight.shape[0] != token_num: - model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) - model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) +A = torch.Tensor([[1, 2, 3],[6,5,4]]) +B = torch.Tensor([[9],[9]]) +A[:,-1:] = B +print(A) +# U,S,V = torch.svd(A) +# # print("-----------------") + +# print(A.shape) +# print("-----------------") +# print(S.shape) +# print("-----------------") +# print(V) # 
base_model = get_model("/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/", "cuda") # params = base_model.state_dict() From 6836149e19514a49132f904604fc8e84d11c6c35 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Thu, 14 Mar 2024 09:30:55 +0800 Subject: [PATCH 03/14] Attn,mlp fp16+1bit --- bitdelta/diff2.py | 30 ++--- cosine_sim_check.py | 281 ++++++++++++++++++++++++++++++++++++++++++++ run.sh | 2 +- run_tailor.sh | 11 ++ 4 files changed, 305 insertions(+), 19 deletions(-) create mode 100644 cosine_sim_check.py create mode 100644 run_tailor.sh diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 30c986c..56ba348 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -64,16 +64,22 @@ def compress_submodule(name, subname, module, submodule): torch.cuda.empty_cache() setattr(module, subname, compressed) - # TODO: this can be parallelized + # TODO: 根据thresh 选择压缩比例 for name, module in finetuned_compressed_model.named_modules(): - - if "self_attn" in name: + if "self_attn" in name or "mlp" in name: for subname, submodule in module.named_children(): if "proj" in subname: base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - # compress_submodule(name, subname, module, submodule) - U,S,V = decomposition(finetuned_weight - base_weight,dim=1024) + dim , thresh = 1024,0.7 + + if "mlp" in name: + dim , thresh = 2048 , 0.24 + + U,S,V = decomposition(finetuned_weight - base_weight,dim=dim) + energy_total = torch.sum(S**2) + energy_top_percent = torch.sum(S[:50]**2) + ratio = energy_top_percent / energy_total compressed_U, compressed_V = BinaryDiff(weight=U[:,64:]).to(finetuned_weight.device), BinaryDiff(weight=V[:,64:]).to(finetuned_weight.device) U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff @@ -82,19 +88,7 @@ def compress_submodule(name, subname, module, submodule): U[:,64:] , V[:,64:] = weight_U.T, weight_V.T # 不确定是否有bug delta = U @ torch.diag(S) @ V.t() with torch.no_grad(): - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) - - - elif "mlp" in name: - with torch.no_grad(): - for subname, submodule in module.named_children(): - if "proj" in subname: - base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - U,S,V = decomposition(finetuned_weight - base_weight,dim=int(128 * 1.45)) - delta = torch.mm(torch.mm(U, torch.diag(S)), V.t()) - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) - + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(base_weight.dtype)) finetuned_model.save_pretrained(save_dir) diff --git a/cosine_sim_check.py b/cosine_sim_check.py new file mode 100644 index 0000000..788937a --- /dev/null +++ b/cosine_sim_check.py @@ -0,0 +1,281 @@ +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" +import torch +from torch import nn +import gc +import torch.nn.functional as F +from bitdelta.diff import save_diff, save_full_model +from bitdelta.misc import find_corr_stddev +from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm +from bitdelta.utils import get_model, parse_args, get_tokenizer +from tqdm import tqdm +from bitdelta.data import 
get_dataset, get_dataloader + +import json +import transformers + +import re +import random +import numpy as np + +def set_random_seed(seed: int = 0): + """ + set random seed + :param seed: int, random seed + :return: + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + +set_random_seed(seed=0) + +def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list): + """ + get the names of parameters that need to be merged + :param input_param_names: list, names of input parameters + :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded + :return: + """ + param_names_to_merge = [] + for param_name in input_param_names: + exclude = any([re.match(exclude_pattern, param_name) for exclude_pattern in exclude_param_names_regex]) + if not exclude: + param_names_to_merge.append(param_name) + return param_names_to_merge + + +def get_model(model_path): + if "mistral" in model_path or "mixtral" in model_path: + data_type = torch.bfloat16 + else: + data_type = torch.float16 + with torch.no_grad(): + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=data_type, + low_cpu_mem_usage=True, + # device_map="auto" + ).to("cuda") + return model + + + + +def singular_values_for_variance(tensor, variances=[0.9, 0.95]): + """ + Calculate the minimum number of singular values needed to reach specified variance ratios. + + Parameters: + - tensor: A 2D tensor for which to calculate the SVD. + - variances: A list of variance ratios to calculate the minimum number of singular values for. + + Returns: + A dictionary with the variance ratios as keys and the minimum number of singular values needed as values. 
+ """ + # Compute SVD + U, S, V = torch.svd(tensor) + # Calculate the squared singular values (proportional to variance explained) + squared_singular_values = torch.pow(S, 2) + total_variance = torch.sum(squared_singular_values) + cumulative_variance_ratios = torch.cumsum(squared_singular_values, dim=0) / total_variance + + # Find the minimum number of singular values for each specified variance + results = {} + for variance in variances: + num_singular_values = torch.searchsorted(cumulative_variance_ratios, variance) + 1 # +1 because indices start at 0 + results[variance] = num_singular_values.item() + + return results + + +def cosine_similarity_matrix(finetuned_param, pretrained_param): + finetuned_flat = finetuned_param.view(-1) + pretrained_flat = pretrained_param.view(-1) + cosine_similarity = F.cosine_similarity(finetuned_flat.unsqueeze(0), pretrained_flat.unsqueeze(0), dim=1) + return cosine_similarity.item() + + +def check_delta_properties(delta_weight): + # analysis properties for each linear weight in deltas + + # 计算矩阵的Frobenius范数(二范数) + matrix_norm = torch.norm(delta_weight, p='fro') + + # 计算矩阵的条件数 + # 矩阵的条件数(Condition Number)衡量的是矩阵求逆的数值稳定性。具体来说,它描述了原始数据的微小变化如何影响矩阵运算的结果。条件数越高,计算结果对数据的微小变化越敏感,即数值解可能不稳定;条件数越低,矩阵和其运算则越稳定。 + + # 定义 + # 对于非奇异矩阵 A,其条件数定义为矩阵 A 的范数与 A 的逆的范数的乘积: + # 其中,范数可以是任意矩阵范数,但是最常用的是2-范数(即谱范数),此时条件数可以解释为矩阵最大奇异值与最小奇异值的比值。 + cond_number = torch.linalg.cond(delta_weight) + + # 计算矩阵的秩 + rank = torch.linalg.matrix_rank(delta_weight) + + # 计算矩阵的有效秩 + rank_eff = singular_values_for_variance(delta_weight, variances=[0.9, 0.95]) + rank_90, rank_95 = rank_eff[0.9], rank_eff[0.95] + + + return matrix_norm, cond_number, rank, rank_90, rank_95 + + + + + ## First part: checkout cosine similarity in first layer FFN w1 + + # if "llama" in base_model_path: + # #weight_key = "model.layers.0.mlp.gate_proj.weight" + # tensor_base = base_model.model.layers[0].mlp.gate_proj.weight + # tensor_ft = finetuned_model.model.layers[0].mlp.gate_proj.weight + # cosine_sim = F.cosine_similarity(tensor_base, tensor_ft, dim=1) + # overall_similarity = cosine_sim.mean() + # base_model_name = base_model_path.split("/")[-1] + # finetuned_model_name = finetuned_model_path.split("/")[-1] + # overall_similarity_result = overall_similarity.item() + # print(f"Overall Cosine Similarity between {base_model_name} and {finetuned_model_name}: {overall_similarity_result}") + # ## 说明是llama模型 + # elif "Mixtral" in base_model_path: + # tensor_base = base_model.model.layers[0].block_sparse_moe.experts[0].w1.weight + # tensor_ft = base_model.model.layers[0].block_sparse_moe.experts[1].w1.weight + # cosine_sim = F.cosine_similarity(tensor_base, tensor_ft, dim=1) + # overall_similarity = cosine_sim.mean() + + + + + ## Second part: checkout delta square decline potential using scaled weight + + ## third part: checkout rank of original delta and + ## scaled calculation delta(relation between variance ratio and #singular values) + +def analysis_delta(base_model_path, finetuned_model_path): + pretrained_model = get_model(base_model_path) + finetuned_model = get_model(finetuned_model_path) + print(f"We are analysising the delta between the Pretrained model: {base_model_path} and the Finetuned model: {finetuned_model_path}") + task_vector_param_dict = {} + pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()} + finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} + param_names_to_merge = 
get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) + + cos_sim_list = [] + norm_list = [] + cond_number_list = [] + rank_list = [] + rank_90_list = [] + rank_95_list = [] + + with torch.no_grad(): + for param_name in param_names_to_merge: + param_list = ['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'] + if all(char not in param_name for char in param_list): + continue + # import pdb + # pdb.set_trace() + #研究finetuned_param_dict[param_name]和pretrained_param_dict[param_name]的cosine similarity + task_vector_param_dict[param_name] = finetuned_param_dict[param_name] - pretrained_param_dict[param_name] + #check similarity + print(f"Investigating param_name: {param_name}") + cos_sim = cosine_similarity_matrix(finetuned_param_dict[param_name].float(), pretrained_param_dict[param_name].float()) + cos_sim_list.append(cos_sim) + print(f"cosine similarity between the finetuned model and pretrained model: ",cos_sim) + #研究他们差值的统计性质 + matrix_norm, cond_number, rank, rank_90, rank_95 = check_delta_properties(task_vector_param_dict[param_name].float()) + norm_list.append(matrix_norm) + cond_number_list.append(cond_number) + rank_list.append(rank) + rank_90_list.append(rank_90) + rank_95_list.append(rank_95) + print(f"Properties of Delta Weight---matrix_norm: {matrix_norm}, cond_number: {cond_number}, rank: {rank}, rank_90: {rank_90}, rank_95: {rank_95}") + + print(f"avg_cos_sim: {sum(cos_sim_list)/len(cos_sim_list)}") + print(f"avg_norm: {sum(norm_list)/len(norm_list)}") + print(f"avg_cond_number: {sum(cond_number_list)/len(cond_number_list)}") + print(f"avg_rank: {sum(rank_list)/len(rank_list)}") + print(f"avg_rank_90: {sum(rank_90_list)/len(rank_90_list)}") + print(f"avg_rank_95: {sum(rank_95_list)/len(rank_95_list)}") + + print(f"Analysis end for the pretrained model: {base_model_path} and finetuned_model: {finetuned_model_path}") + del pretrained_model + del finetuned_model + return + +moe_base = "/home/wanghanqing/projects/models/model_ver2/Mixtral-8x7B-v0.1" +instruct_base = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.2" +base_model = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-v0.1" + +code_llama13 = "/data/public/opensource_models/codellama/codellama-13b-python-hf" +wizard_coder = "/data/public/opensource_models/WizardLM/WizardCoder-Python-13B-V1.0" +llama2_7b = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" +llama2_7b_chat = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf" +llama2_13b = "/data/public/opensource_models/meta-llama/Llama-2-13b-hf" +llama2_13b_chat = "/data/public/opensource_models/meta-llama/Llama-2-13b-chat-hf" +wizard_math_7b = "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0" +wizard_math_13b = "/data/public/opensource_models/WizardLM/WizardMath-13B-V1.0" +meta_math_7b = "/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf" +magicoder_7b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf" +magicoder_13b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-13b/ckpts/checkpoints/epoch_2_hf" + + +# Mistral-7B +## base +mistral_7b = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-v0.1" +## finetuned +mistral_7b_instruct_v1 = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.1" +mistral_7b_instruct_v2 = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.2" + +# llama2-7b +## base +llama2_7b = 
"/data/public/opensource_models/meta-llama/Llama-2-7b-hf" +## finetuned +llama2_7b_chat = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf" +wizard_math_7b = "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0" +meta_math_7b = "/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf" +magicoder_7b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf" + +# llama2-13b +## base +llama2_13b = "/data/public/opensource_models/meta-llama/Llama-2-13b-hf" +## finetuned +llama2_13b_chat = "/data/public/opensource_models/meta-llama/Llama-2-13b-chat-hf" +wizard_math_13b = "/data/public/opensource_models/WizardLM/WizardMath-13B-V1.0" +magicoder_13b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-13b/ckpts/checkpoints/epoch_2_hf" +code_llama13 = "/data/public/opensource_models/codellama/codellama-13b-python-hf" +wizard_coder = "/data/public/opensource_models/WizardLM/WizardCoder-Python-13B-V1.0" + + + + +import sys + +# 打开一个日志文件 +log_file = open("analysis_log.txt", "w") + +# 保存原始的标准输出 +original_stdout = sys.stdout + +# 重定向标准输出到文件 +sys.stdout = log_file + +# 你的代码,所有print函数的输出都会写入log.txt +print("This will be written to analysis_log.txt") + + + + + +analysis_delta(base_model_path = llama2_7b, finetuned_model_path = llama2_7b_chat) +analysis_delta(base_model_path = llama2_7b, finetuned_model_path = wizard_math_7b) +analysis_delta(base_model_path = llama2_7b, finetuned_model_path = meta_math_7b) +analysis_delta(base_model_path = llama2_7b, finetuned_model_path = magicoder_7b) + +# 恢复原始的标准输出 +sys.stdout = original_stdout + +# 关闭日志文件 +log_file.close() \ No newline at end of file diff --git a/run.sh b/run.sh index 8874caf..eb40a99 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=save/uncalibrated_model_0 +MODEL_SAVE_DIR=save/uncalibrated_model_attn_1024_mlp_2048 mkdir -p $MODEL_SAVE_DIR diff --git a/run_tailor.sh b/run_tailor.sh new file mode 100644 index 0000000..71b7393 --- /dev/null +++ b/run_tailor.sh @@ -0,0 +1,11 @@ +python \ + tailor.py \ + --finetuned_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf \ + --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/7b_chat \ + + +# & + +# python3 tailor.py \ +# --finetuned_model_name /data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf \ +# --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/math_lora_7b \ \ No newline at end of file From 5ea7c52fb6727d31fddb01a762e4639c60270d1c Mon Sep 17 00:00:00 2001 From: pingbowen Date: Sat, 16 Mar 2024 11:14:55 +0800 Subject: [PATCH 04/14] delta orthogonal --- bitdelta/diff2.py | 60 ++++++++++++++++++++++++++++++++++++++++++++-- bitdelta/train2.py | 4 ++-- run.sh | 10 ++++---- test.py | 31 ++++++++++++++---------- 4 files changed, 83 insertions(+), 22 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 56ba348..1303f27 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import gc - +import torch.nn.functional as F from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm from bitdelta.utils import get_model, get_tokenizer @@ -46,6 +46,32 @@ def Pass(layers=None,name=None): return False +def solve_orthogonal(p, f): + # 计算x + delta ,n , sacled_p = f - p, p.shape[-1],p + + # import pdb; pdb.set_trace() + + for i in range(n): + p_i,f_i = p[:,i],f[:,i] + dot_fp , dot_pd = torch.dot(f_i, p_i) , torch.dot(p_i, delta[:,i]) + + if dot_fp == 0 or dot_pd == 0: # 
p_i或f_i是零向量,因为低秩, 边界p_i与delta_i直接正交 + continue + + dot_pp = torch.dot(p_i, p_i) + x = dot_fp / dot_pp if dot_pp != 0 else None + + + # 计算(f - xp) + with torch.no_grad(): + delta[:,i].data.copy_(f_i - x * p_i) if x is not None else None + sacled_p[:,i].data.copy_(sacled_p[:,i].data * x) if x is not None else None + + # import pdb; pdb.set_trace() + + return delta , sacled_p + def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -68,6 +94,35 @@ def compress_submodule(name, subname, module, submodule): for name, module in finetuned_compressed_model.named_modules(): if "self_attn" in name or "mlp" in name: for subname, submodule in module.named_children(): + + with torch.no_grad(): + if "proj" in subname: + p = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + f = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + dim = 128 + + if "mlp" in name: + dim = int(128 * 1.45) + + delta , scaled_p = solve_orthogonal(p, f) + U,S,V = decomposition(delta,dim=dim) + delta = U @ torch.diag(S) @ V.t() + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) + + ''' + if torch.sum(torch.abs(delta_pre)) > torch.sum(torch.abs(delta)): + U,S,V = decomposition(delta,dim=128) + delta = U @ torch.diag(S) @ V.t() + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) + else: + U,S,V = decomposition(delta_pre,dim=128) + delta_pre = U @ torch.diag(S) @ V.t() + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta_pre.to(p.dtype)) + ''' + + ''' + fp 16 + 1bit + if "proj" in subname: base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) @@ -89,7 +144,8 @@ def compress_submodule(name, subname, module, submodule): delta = U @ torch.diag(S) @ V.t() with torch.no_grad(): finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(base_weight.dtype)) - + ''' + finetuned_model.to(torch.bfloat16) finetuned_model.save_pretrained(save_dir) def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): diff --git a/bitdelta/train2.py b/bitdelta/train2.py index eb9d66d..e87ff5e 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -20,8 +20,8 @@ tokenizer = get_tokenizer(args.finetuned_model) with torch.no_grad(): - base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) - finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map).to(torch.float32) + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map).to(torch.float32) finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) diff --git a/run.sh b/run.sh index eb40a99..5a7bcc6 100644 --- a/run.sh +++ b/run.sh @@ -1,15 +1,15 @@ -MODEL_SAVE_DIR=save/uncalibrated_model_attn_1024_mlp_2048 +MODEL_SAVE_DIR=save/uncalibrated_model_orthogonal_math mkdir -p $MODEL_SAVE_DIR 
-CUDA_VISIBLE_DEVICES=6,7 python \ +CUDA_VISIBLE_DEVICES=5,6 python \ bitdelta/train2.py \ --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ --batch_size 4 \ --num_steps 200 \ - --save_full_model True + --save_full_model True \ + # &> test.log - # --layers "layers.5."\ - # /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ + # /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ diff --git a/test.py b/test.py index 1d8fe23..7e0136f 100644 --- a/test.py +++ b/test.py @@ -1,10 +1,12 @@ import argparse import transformers +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "7" import torch from transformers import AutoConfig, AutoModelForCausalLM from accelerate import infer_auto_device_map, init_empty_weights import torch.nn as nn -import os +import torch.nn.functional as F # from llava.model.language_model.llava_llama import LlavaConfig from transformers import AutoTokenizer, AutoModelForCausalLM # from llava.model import * @@ -89,22 +91,25 @@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d del base_model -A = torch.Tensor([[1, 2, 3],[6,5,4]]) -B = torch.Tensor([[9],[9]]) +device = "cuda" if torch.cuda.is_available() else "cpu" + +# model = AutoModelForCausalLM.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/").to(device).to(torch.bfloat16) +# k = model.get_submodule("model.layers.0.self_attn.k_proj").weight + +a = torch.rand(4096) / 1000 +b = torch.rand(4096) / 1000 + +# a , b = a.to(torch.bfloat16) , b.to(torch.bfloat16) + +dot_fp , dot_pp = torch.dot(a, b) , torch.dot(b, b) -A[:,-1:] = B +x = dot_fp / dot_pp -print(A) -# U,S,V = torch.svd(A) -# # print("-----------------") +cosine_sim = F.cosine_similarity(a,b,dim=0) -# print(A.shape) -# print("-----------------") -# print(S.shape) -# print("-----------------") -# print(V) +cosine_sim2 = F.cosine_similarity(b,a - x * b,dim=0) -# base_model = get_model("/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/", "cuda") +import pdb; pdb.set_trace() # params = base_model.state_dict() # print(params.keys()) From 7f2339df6e27e1a03d44a9ce08baf5231dd71ad3 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Mon, 18 Mar 2024 19:22:42 +0800 Subject: [PATCH 05/14] orthogonal --- bitdelta/diff2.py | 30 ++++++++++++++++++++++++++++-- run.sh | 2 +- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 1303f27..f303ca7 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -72,6 +72,25 @@ def solve_orthogonal(p, f): return delta , sacled_p +def get_outlier(tensor, percent=0.5): + # 计算保留的元素数量 + num_elements = tensor.numel() + num_to_keep = int(num_elements * percent / 100) + + # 展平张量并获取最大和最小的元素的索引 + flat_tensor = tensor.flatten() + _, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True) + _, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False) + + # 创建一个全零张量 + result = torch.zeros_like(tensor) + + # 仅在指定位置放置最大和最小的元素 + result.view(-1)[top_indices] = tensor.view(-1)[top_indices] + result.view(-1)[bottom_indices] = tensor.view(-1)[bottom_indices] + + return result + def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -99,13 +118,20 @@ def compress_submodule(name, subname, module, submodule): if "proj" in subname: p = 
base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) f = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - dim = 128 + dim , fp16_col = 1024 , 64 if "mlp" in name: - dim = int(128 * 1.45) + fp16_col = 128 delta , scaled_p = solve_orthogonal(p, f) U,S,V = decomposition(delta,dim=dim) + + compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) + U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff + weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff + U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T + + delta = U @ torch.diag(S) @ V.t() finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) diff --git a/run.sh b/run.sh index 5a7bcc6..266e5a1 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=save/uncalibrated_model_orthogonal_math +MODEL_SAVE_DIR=save/uncalibrated_model_orthogonal_mix_math mkdir -p $MODEL_SAVE_DIR From 19c1d3eb225a9083f71b7c7bbc707d953ac57cd3 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Tue, 19 Mar 2024 19:16:43 +0800 Subject: [PATCH 06/14] add outlier --- bitdelta/diff2.py | 84 +++++++++++++++++++++++++++++++---------------- run.sh | 2 +- test.py | 45 +++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 32 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index f303ca7..c0b6887 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -86,11 +86,19 @@ def get_outlier(tensor, percent=0.5): result = torch.zeros_like(tensor) # 仅在指定位置放置最大和最小的元素 - result.view(-1)[top_indices] = tensor.view(-1)[top_indices] - result.view(-1)[bottom_indices] = tensor.view(-1)[bottom_indices] + result = result.flatten() + result[top_indices] = flat_tensor[top_indices] + result[bottom_indices] = flat_tensor[bottom_indices] + result = result.reshape(tensor.shape) return result - + +def copy_nonzero_values(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = B[mask] + return A + def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -118,33 +126,28 @@ def compress_submodule(name, subname, module, submodule): if "proj" in subname: p = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) f = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - dim , fp16_col = 1024 , 64 - if "mlp" in name: - fp16_col = 128 - - delta , scaled_p = solve_orthogonal(p, f) - U,S,V = decomposition(delta,dim=dim) + delta , outlier_U, outlier_V = f - p , None, None + dim , fp16_col = 1024, 64 + if "self_attn" in name: + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name) + else: + dim , fp16_col = 1024 , 128 + # delta , scaled_p = solve_orthogonal(p, f) + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name) + compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff - U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T - + U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T + 
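# A minimal sketch of the mixed scheme applied above, assuming the BinaryDiff
# semantics defined earlier in this file (a sign mask scaled by the mean absolute
# value): the first `fp16_col` singular-vector columns stay in full precision, the
# remaining columns are binarized, and the largest-magnitude "outlier" entries are
# copied back in full precision just below.
import torch

def mixed_lowrank_approx(delta: torch.Tensor, dim: int = 1024, fp16_col: int = 64) -> torch.Tensor:
    # Rank-`dim` SVD of the weight delta.
    U, S, V = torch.svd(delta.float())
    U, S, V = U[:, :dim], S[:dim], V[:, :dim]
    for M in (U, V):
        tail = M[:, fp16_col:]
        sign = (tail >= 0).to(tail.dtype) * 2 - 1   # 1-bit sign mask, as in BinaryDiff
        M[:, fp16_col:] = sign * tail.abs().mean()  # one full-precision scale per matrix (coeff)
    return U @ torch.diag(S) @ V.t()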
+ # import pdb; pdb.set_trace() + if outlier_U is not None and outlier_V is not None: + copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) - delta = U @ torch.diag(S) @ V.t() - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) - - ''' - if torch.sum(torch.abs(delta_pre)) > torch.sum(torch.abs(delta)): - U,S,V = decomposition(delta,dim=128) - delta = U @ torch.diag(S) @ V.t() - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) - else: - U,S,V = decomposition(delta_pre,dim=128) - delta_pre = U @ torch.diag(S) @ V.t() - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta_pre.to(p.dtype)) - ''' + delta = U @ torch.diag(S) @ V.t() + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) ''' fp 16 + 1bit @@ -171,6 +174,7 @@ def compress_submodule(name, subname, module, submodule): with torch.no_grad(): finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(base_weight.dtype)) ''' + # import pdb ; pdb.set_trace() finetuned_model.to(torch.bfloat16) finetuned_model.save_pretrained(save_dir) @@ -222,16 +226,40 @@ def load_diff(model, diff_dir,ori_diff): model.config.vocab_size = model.lm_head.weight.size(0) -def decomposition(masked_input_tensor,dim=None,st=None,ed=None,name=None): +def set_zero(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = 0 + return A + + +def decomposition(masked_input_tensor,dim=None,name=None): U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) + outlier_U , outlier_V = None, None + if dim is not None: U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] - if st is not None and ed is not None: - U , S , V = U[:, st:ed],S[st:ed] ,V[:, st:ed] + if "self_attn" in name: + outlier_U = get_outlier(U[:,64:], percent=0.2) + outlier_V = get_outlier(V[:,64:], percent=0.2) + + set_zero(U[:,64:], outlier_U) + set_zero(V[:,64:], outlier_V) + + else: + outlier_U = get_outlier(U[:,128:], percent=0.1) + outlier_V = get_outlier(V[:,128:], percent=0.1) + + set_zero(U[:,128:], outlier_U) + set_zero(V[:,128:], outlier_V) - return U, S, V + # max_val, min_val, mean_abs_val = round(torch.max(U).item(),4), round(torch.min(U).item(),4), round(torch.mean(torch.abs(U)).item(),4) + + # print(f"max_val {max_val} pos_min {round(torch.min(outlier[outlier > 0]).item(),4)} mean_abs_val {mean_abs_val} ratio {round(torch.min(outlier[outlier > 0]).item() / mean_abs_val,4)}") + # import pdb; pdb.set_trace() + return U, S, V , outlier_U, outlier_V def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): base_model = get_model(base_model_name, device) diff --git a/run.sh b/run.sh index 266e5a1..6eceba8 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=save/uncalibrated_model_orthogonal_mix_math +MODEL_SAVE_DIR=save/uncalibrated_model mkdir -p $MODEL_SAVE_DIR diff --git a/test.py b/test.py index 7e0136f..9709030 100644 --- a/test.py +++ b/test.py @@ -93,9 +93,6 @@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d device = "cuda" if torch.cuda.is_available() else "cpu" -# model = AutoModelForCausalLM.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/").to(device).to(torch.bfloat16) -# k = model.get_submodule("model.layers.0.self_attn.k_proj").weight - a = torch.rand(4096) / 1000 b = torch.rand(4096) / 1000 @@ -109,6 +106,48 
@@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d cosine_sim2 = F.cosine_similarity(b,a - x * b,dim=0) +def filter_top_and_bottom_percent(tensor, percent=0.5): + # 计算保留的元素数量 + num_elements = tensor.numel() + num_to_keep = int(num_elements * percent / 100) + + # 展平张量并获取最大和最小的元素的索引 + flat_tensor = tensor.flatten() + _, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True) + _, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False) + + # 创建一个全零张量 + result = torch.zeros_like(tensor) + + # 仅在指定位置放置最大和最小的元素 + result = result.flatten() + result[top_indices] = flat_tensor[top_indices] + result[bottom_indices] = flat_tensor[bottom_indices] + result = result.reshape(tensor.shape) + + return result + +def copy_nonzero_values(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = B[mask] + return A + + +# 示例 +n = 4 +A = torch.randn(n, n) # 随机生成一个n × n的张量A +B = torch.zeros(n, n) # 创建一个n × n的全零张量B + +# 在B中随机设置一些非零值 +indices = torch.randint(0, n, (3, 2)) # 随机选择一些位置 +for i, j in indices: + B[i, j] = torch.randn(1).item() # 随机非零值 + +# 复制B中的非零值到A +updated_A = copy_nonzero_values(A, B) + + import pdb; pdb.set_trace() # params = base_model.state_dict() From 46d46ca852d757ab0838ae071686781ad54bca09 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Wed, 20 Mar 2024 18:31:42 +0800 Subject: [PATCH 07/14] for test --- bitdelta/diff2.py | 9 +++++++-- test.py | 10 ++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index c0b6887..e6166a4 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -5,6 +5,9 @@ from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm from bitdelta.utils import get_model, get_tokenizer +# 离群值抽出之后 原来位置设定成多少,如果设置成0会让分母增大 +# U, V + class BinaryDiff(nn.Module): def __init__(self, weight): super().__init__() @@ -142,9 +145,10 @@ def compress_submodule(name, subname, module, submodule): weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T - # import pdb; pdb.set_trace() + if outlier_U is not None and outlier_V is not None: - copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) + tmp = copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) + # import pdb; pdb.set_trace() delta = U @ torch.diag(S) @ V.t() finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) @@ -246,6 +250,7 @@ def decomposition(masked_input_tensor,dim=None,name=None): outlier_V = get_outlier(V[:,64:], percent=0.2) set_zero(U[:,64:], outlier_U) + # import pdb; pdb.set_trace() set_zero(V[:,64:], outlier_V) else: diff --git a/test.py b/test.py index 9709030..ab02ab3 100644 --- a/test.py +++ b/test.py @@ -138,14 +138,8 @@ def copy_nonzero_values(A, B): n = 4 A = torch.randn(n, n) # 随机生成一个n × n的张量A B = torch.zeros(n, n) # 创建一个n × n的全零张量B - -# 在B中随机设置一些非零值 -indices = torch.randint(0, n, (3, 2)) # 随机选择一些位置 -for i, j in indices: - B[i, j] = torch.randn(1).item() # 随机非零值 - -# 复制B中的非零值到A -updated_A = copy_nonzero_values(A, B) +A = A.flatten() +values , top_indices = torch.topk(A, 1, largest=True) import pdb; pdb.set_trace() From 840417cea482888fdac0863447e0269801aaec95 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Thu, 21 Mar 2024 08:20:42 +0800 Subject: [PATCH 08/14] bitdelta outlier --- bitdelta/diff.py | 75 ++++++++++++++++++++++++++++++++--------------- bitdelta/diff2.py | 2 +- bitdelta/train.py | 
5 ++++ run.sh | 2 +- 4 files changed, 59 insertions(+), 25 deletions(-) diff --git a/bitdelta/diff.py b/bitdelta/diff.py index 2f97ab7..594b936 100644 --- a/bitdelta/diff.py +++ b/bitdelta/diff.py @@ -9,7 +9,9 @@ class BinaryDiff(nn.Module): def __init__(self, base, finetune): super().__init__() diff = finetune - base - diff = decomposition(diff, st=64, ed=1024) + outlier = get_outlier(diff, percent=0.02) + set_zero(diff, outlier) + # import pdb; pdb.set_trace() quantile = diff.float().abs().mean() mask = torch.ones_like(diff) @@ -18,6 +20,7 @@ def __init__(self, base, finetune): self.register_buffer("mask", mask) self.register_buffer("base", base.T) + self.register_buffer("outlier", outlier) self.register_parameter( "coeff", nn.Parameter( @@ -39,13 +42,38 @@ def forward(self, x): repeated_mask = self.mask.unsqueeze(0).repeat(x.size(0), 1, 1) return x @ self.base + self.coeff * binary_bmm(x, repeated_mask) -def Pass(layers=None,name=None): - if layers is not None: - for layer in layers: - if layer in name: - return True - return False +def set_zero(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = 0 + return A +def get_outlier(tensor, percent=0.5): + # 计算保留的元素数量 + num_elements = tensor.numel() + num_to_keep = int(num_elements * percent / 100) + + # 展平张量并获取最大和最小的元素的索引 + flat_tensor = tensor.flatten() + _, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True) + _, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False) + + # 创建一个全零张量 + result = torch.zeros_like(tensor) + + # 仅在指定位置放置最大和最小的元素 + result = result.flatten() + result[top_indices] = flat_tensor[top_indices] + result[bottom_indices] = flat_tensor[bottom_indices] + result = result.reshape(tensor.shape) + + return result + +def copy_nonzero_values(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = B[mask] + return A def compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=None): def compress_submodule(name, subname, module, submodule): @@ -67,25 +95,26 @@ def compress_submodule(name, subname, module, submodule): # TODO: this can be parallelized # flag = False - for name, module in finetuned_compressed_model.named_modules(): - # if flag == True: - # break - - if "self_attn" in name: - for subname, submodule in module.named_children(): - if "proj" in subname: - compress_submodule(name, subname, module, submodule) - elif "mlp" in name: - with torch.no_grad(): + with torch.no_grad(): + for name, module in finetuned_model.named_modules(): + if "self_attn" in name or "mlp" in name: for subname, submodule in module.named_children(): if "proj" in subname: - base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - delta = decomposition(finetuned_weight - base_weight,dim=int(128 * 1.45)) - finetuned_compressed_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) - # flag = True + p , f = base_model.get_submodule(f"{name}.{subname}").weight.detach() , finetuned_model.get_submodule(f"{name}.{subname}").weight.detach() + + compressed = BinaryDiff(base=p, finetune=f) + mask, coeff , outlier = compressed.mask, compressed.coeff, compressed.outlier + weight = (unpack(mask)*2-1) * coeff + weight = weight.T.to(outlier.dtype) + + copy_nonzero_values(weight, outlier) # import pdb; pdb.set_trace() - # break + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + 
weight.to(p.dtype)) + + finetuned_model.save_pretrained("/home/pingbowen/workspace/delta-compression/BitDelta/save/test") + + + def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): diff_dict = {} diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index e6166a4..f60b6dd 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -147,7 +147,7 @@ def compress_submodule(name, subname, module, submodule): if outlier_U is not None and outlier_V is not None: - tmp = copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) + copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) # import pdb; pdb.set_trace() delta = U @ torch.diag(S) @ V.t() diff --git a/bitdelta/train.py b/bitdelta/train.py index 9e4bf97..a4a44e5 100644 --- a/bitdelta/train.py +++ b/bitdelta/train.py @@ -46,6 +46,10 @@ def original_diff(base_model, finetuned_model): print(f"compressing diff...") compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=args.layers) +tokenizer.save_pretrained("/home/pingbowen/workspace/delta-compression/BitDelta/save/test") + + +''' train_num_samples = args.batch_size * args.num_steps train_dataset = get_dataset( args.dataset_name, @@ -114,3 +118,4 @@ def original_diff(base_model, finetuned_model): save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, f"uncalibrated_model"), device="cpu",layers=args.layers,ori_diff=ori_diff) # print("saving calibrated model") # save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") +''' \ No newline at end of file diff --git a/run.sh b/run.sh index 6eceba8..cc5a5c4 100644 --- a/run.sh +++ b/run.sh @@ -3,7 +3,7 @@ MODEL_SAVE_DIR=save/uncalibrated_model mkdir -p $MODEL_SAVE_DIR CUDA_VISIBLE_DEVICES=5,6 python \ - bitdelta/train2.py \ + bitdelta/train.py \ --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ From 7c186f444c7f91cefd2c365956f47ec76d61aefc Mon Sep 17 00:00:00 2001 From: pingbowen Date: Thu, 21 Mar 2024 18:33:23 +0800 Subject: [PATCH 09/14] modify --- run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.sh b/run.sh index cc5a5c4..6eceba8 100644 --- a/run.sh +++ b/run.sh @@ -3,7 +3,7 @@ MODEL_SAVE_DIR=save/uncalibrated_model mkdir -p $MODEL_SAVE_DIR CUDA_VISIBLE_DEVICES=5,6 python \ - bitdelta/train.py \ + bitdelta/train2.py \ --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ From 8723564be15671746bcba4b4dd42989d751fd4e1 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Mon, 25 Mar 2024 15:27:16 +0800 Subject: [PATCH 10/14] outlier --- bitdelta/diff2.py | 16 ++++++++-------- bitdelta/train2.py | 6 +++--- bitdelta/utils.py | 2 ++ run.sh | 13 ++++++++++--- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index f60b6dd..8489ccc 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -102,7 +102,7 @@ def copy_nonzero_values(A, B): A[mask] = B[mask] return A -def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): +def compress_diff(base_model, finetuned_model, 
finetuned_compressed_model,save_dir,args): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -134,11 +134,11 @@ def compress_submodule(name, subname, module, submodule): dim , fp16_col = 1024, 64 if "self_attn" in name: - U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name) + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,attn_outlier=args.attn_outlier) else: dim , fp16_col = 1024 , 128 # delta , scaled_p = solve_orthogonal(p, f) - U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name) + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,mlp_outlier=args.mlp_outlier) compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff @@ -237,7 +237,7 @@ def set_zero(A, B): return A -def decomposition(masked_input_tensor,dim=None,name=None): +def decomposition(masked_input_tensor,dim=None,name=None,attn_outlier=0.1,mlp_outlier=0.1): U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) outlier_U , outlier_V = None, None @@ -246,16 +246,16 @@ def decomposition(masked_input_tensor,dim=None,name=None): U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] if "self_attn" in name: - outlier_U = get_outlier(U[:,64:], percent=0.2) - outlier_V = get_outlier(V[:,64:], percent=0.2) + outlier_U = get_outlier(U[:,64:], percent=attn_outlier) + outlier_V = get_outlier(V[:,64:], percent=attn_outlier) set_zero(U[:,64:], outlier_U) # import pdb; pdb.set_trace() set_zero(V[:,64:], outlier_V) else: - outlier_U = get_outlier(U[:,128:], percent=0.1) - outlier_V = get_outlier(V[:,128:], percent=0.1) + outlier_U = get_outlier(U[:,128:], percent=mlp_outlier) + outlier_V = get_outlier(V[:,128:], percent=mlp_outlier) set_zero(U[:,128:], outlier_U) set_zero(V[:,128:], outlier_V) diff --git a/bitdelta/train2.py b/bitdelta/train2.py index e87ff5e..e2fed61 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -20,12 +20,12 @@ tokenizer = get_tokenizer(args.finetuned_model) with torch.no_grad(): - base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map).to(torch.float32) - finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map).to(torch.float32) + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) print(f"compressing diff...") -compress_diff(base_model, finetuned_model, finetuned_compressed_model,args.save_dir) +compress_diff(base_model, finetuned_model, finetuned_compressed_model,args.save_dir,args) tokenizer.save_pretrained(args.save_dir) diff --git a/bitdelta/utils.py b/bitdelta/utils.py index 1304239..265ba27 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -26,6 +26,8 @@ def parse_args(): parser.add_argument("--max_length", type=int, default=128) parser.add_argument("--save_dir", type=str, required=True) parser.add_argument("--train", action="store_true") + parser.add_argument("--attn_outlier", type=float,default=1e-4) + parser.add_argument("--mlp_outlier", type=float,default=1e-4) # device management parser.add_argument("--base_model_device", 
type=str, default="0") diff --git a/run.sh b/run.sh index 6eceba8..d9e5c0b 100644 --- a/run.sh +++ b/run.sh @@ -1,15 +1,22 @@ -MODEL_SAVE_DIR=save/uncalibrated_model +MODEL_SAVE_DIR=./../save/uncalibrated_llava mkdir -p $MODEL_SAVE_DIR +values=(0.05 0.2 0.4 0.5 0.75) + +# for value in ${values[@]} +# do CUDA_VISIBLE_DEVICES=5,6 python \ bitdelta/train2.py \ - --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ - --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ + --base_model /home/pingbowen/models/vicuna-13b-v1.5 \ + --finetuned_model /home/pingbowen/models/Llava-v1.5 \ --save_dir $MODEL_SAVE_DIR \ --batch_size 4 \ --num_steps 200 \ --save_full_model True \ + --attn_outlier 0.2 \ + --mlp_outlier 0.1 \ # &> test.log +# done # /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ From 53a0fc7a00cd5f4a159952c6005e1589b32b15a0 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Sun, 31 Mar 2024 18:20:59 +0800 Subject: [PATCH 11/14] load llava --- bitdelta/diff2.py | 8 ++++++-- bitdelta/train2.py | 13 ++++++++----- bitdelta/utils.py | 40 +++++++++++++++++++++++++++++++++++++++- run.sh | 2 +- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 8489ccc..9e587d0 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -102,7 +102,7 @@ def copy_nonzero_values(A, B): A[mask] = B[mask] return A -def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,args): +def compress_diff(base_model, finetuned_model, save_dir,args): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -121,7 +121,11 @@ def compress_submodule(name, subname, module, submodule): setattr(module, subname, compressed) # TODO: 根据thresh 选择压缩比例 - for name, module in finetuned_compressed_model.named_modules(): + for name, module in finetuned_model.named_modules(): + + if "vision" in name: + continue + if "self_attn" in name or "mlp" in name: for subname, submodule in module.named_children(): diff --git a/bitdelta/train2.py b/bitdelta/train2.py index e2fed61..47fda22 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -6,7 +6,7 @@ from bitdelta.diff2 import compress_diff, save_diff, save_full_model from bitdelta.misc import find_corr_stddev -from bitdelta.utils import get_model, parse_args, get_tokenizer +from bitdelta.utils import get_model, parse_args, get_tokenizer,load_llava from tqdm import tqdm from bitdelta.data import get_dataset, get_dataloader @@ -21,11 +21,14 @@ with torch.no_grad(): base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) - finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + if "llava" in args.finetuned_model.lower(): + finetuned_model = load_llava(args.finetuned_model,device="cuda" if torch.cuda.is_available() else "cpu") + else: + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) -finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) +import pdb;pdb.set_trace() print(f"compressing diff...") -compress_diff(base_model, finetuned_model, finetuned_compressed_model,args.save_dir,args) +compress_diff(base_model, finetuned_model, args.save_dir,args) -tokenizer.save_pretrained(args.save_dir) +tokenizer.save_pretrained(args.save_dir) \ No newline at end of file diff --git 
a/bitdelta/utils.py b/bitdelta/utils.py index 265ba27..c957552 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -1,8 +1,46 @@ import argparse import transformers import torch -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM,AutoTokenizer from accelerate import infer_auto_device_map, init_empty_weights +import os +from llava.model import * +from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + +def load_llava(path,device): + tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False) + model = LlavaLlamaForCausalLM.from_pretrained( + path, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + ).to(device) + + + image_processor = None + + if 'llava' in path.lower(): + mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) + mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) + if mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + if mm_use_im_start_end: + tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + model.resize_token_embeddings(len(tokenizer)) + + vision_tower = model.get_vision_tower() + if not vision_tower.is_loaded: + vision_tower.load_model(device_map=device) + if device != 'auto': + vision_tower.to(device=device, dtype=torch.float16) + image_processor = vision_tower.image_processor + + if hasattr(model.config, "max_sequence_length"): + context_len = model.config.max_sequence_length + else: + context_len = 2048 + + return model + def parse_args(): parser = argparse.ArgumentParser(description="BitDelta") diff --git a/run.sh b/run.sh index d9e5c0b..9c9f591 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=./../save/uncalibrated_llava +MODEL_SAVE_DIR=./../save/llama_7b_chat_attn_mlp_outlier_0.2_0.1/ mkdir -p $MODEL_SAVE_DIR From 85ea71f64033bb1e7718ee92a7f25948469d14d2 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Sun, 31 Mar 2024 19:32:39 +0800 Subject: [PATCH 12/14] support mix and low_bit --- bitdelta/diff2.py | 70 +++++++++++++++++----------------------------- bitdelta/train2.py | 2 +- bitdelta/utils.py | 2 +- run.sh | 9 ++++-- 4 files changed, 34 insertions(+), 49 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 9e587d0..7cf946f 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -135,53 +135,35 @@ def compress_submodule(name, subname, module, submodule): f = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) delta , outlier_U, outlier_V = f - p , None, None - dim , fp16_col = 1024, 64 - if "self_attn" in name: - U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,attn_outlier=args.attn_outlier) - else: - dim , fp16_col = 1024 , 128 - # delta , scaled_p = solve_orthogonal(p, f) - U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,mlp_outlier=args.mlp_outlier) - - compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) - U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff - weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff - U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T + if args.choice == "mix": + dim , fp16_col = 1024, 64 + + if "self_attn" in name: + U,S,V,outlier_U,outlier_V = 
decomposition(delta,dim=dim,name=name,attn_outlier=args.attn_outlier) + else: + dim , fp16_col = 1024 , 128 + # delta , scaled_p = solve_orthogonal(p, f) + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,mlp_outlier=args.mlp_outlier) + + compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) + U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff + weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff + U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T - - if outlier_U is not None and outlier_V is not None: - copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) - # import pdb; pdb.set_trace() - - delta = U @ torch.diag(S) @ V.t() + + if outlier_U is not None and outlier_V is not None: + copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) + # import pdb; pdb.set_trace() + + delta = U @ torch.diag(S) @ V.t() + elif args.choice == "bit": + compressed = BinaryDiff(weight=delta).to(f.device) + mask , coeff = compressed.mask, compressed.coeff + delta = (unpack(mask)*2-1) * coeff + delta = delta.T + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) - - ''' - fp 16 + 1bit - - if "proj" in subname: - base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - dim , thresh = 1024,0.7 - - if "mlp" in name: - dim , thresh = 2048 , 0.24 - - U,S,V = decomposition(finetuned_weight - base_weight,dim=dim) - energy_total = torch.sum(S**2) - energy_top_percent = torch.sum(S[:50]**2) - ratio = energy_top_percent / energy_total - - compressed_U, compressed_V = BinaryDiff(weight=U[:,64:]).to(finetuned_weight.device), BinaryDiff(weight=V[:,64:]).to(finetuned_weight.device) - U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff - weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff - # import pdb; pdb.set_trace() - U[:,64:] , V[:,64:] = weight_U.T, weight_V.T # 不确定是否有bug - delta = U @ torch.diag(S) @ V.t() - with torch.no_grad(): - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(base_weight.dtype)) - ''' # import pdb ; pdb.set_trace() finetuned_model.to(torch.bfloat16) finetuned_model.save_pretrained(save_dir) diff --git a/bitdelta/train2.py b/bitdelta/train2.py index 47fda22..bc2f7e5 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -27,7 +27,7 @@ finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) -import pdb;pdb.set_trace() +# import pdb;pdb.set_trace() print(f"compressing diff...") compress_diff(base_model, finetuned_model, args.save_dir,args) diff --git a/bitdelta/utils.py b/bitdelta/utils.py index c957552..18ea947 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -66,6 +66,7 @@ def parse_args(): parser.add_argument("--train", action="store_true") parser.add_argument("--attn_outlier", type=float,default=1e-4) parser.add_argument("--mlp_outlier", type=float,default=1e-4) + parser.add_argument("--choice", type=str,choices=['mix','bit','rank'],default=None) # device management parser.add_argument("--base_model_device", type=str, 
default="0") @@ -144,7 +145,6 @@ def get_model(model_name, device, memory_map=None): else: # single-gpu or cpu return transformers.AutoModelForCausalLM.from_pretrained( model_name, - # torch_dtype=torch.float16, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, ).to(device) diff --git a/run.sh b/run.sh index 9c9f591..42d5b26 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=./../save/llama_7b_chat_attn_mlp_outlier_0.2_0.1/ +MODEL_SAVE_DIR=./../save/test mkdir -p $MODEL_SAVE_DIR @@ -8,15 +8,18 @@ values=(0.05 0.2 0.4 0.5 0.75) # do CUDA_VISIBLE_DEVICES=5,6 python \ bitdelta/train2.py \ - --base_model /home/pingbowen/models/vicuna-13b-v1.5 \ - --finetuned_model /home/pingbowen/models/Llava-v1.5 \ + --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ + --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ --batch_size 4 \ --num_steps 200 \ --save_full_model True \ --attn_outlier 0.2 \ --mlp_outlier 0.1 \ + --choice bit # &> test.log # done # /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ + # /home/pingbowen/models/vicuna-13b-v1.5 , /home/pingbowen/models/Llava-v1.5 + # /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ From f93952c6d7477f4913b4ae5a7a7a400eabe6bdf3 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Mon, 29 Apr 2024 16:38:34 +0800 Subject: [PATCH 13/14] support svd --- bitdelta/diff2.py | 51 ++++++++++---- bitdelta/train2.py | 1 - bitdelta/utils.py | 11 +-- run.sh | 28 +++++--- run_tailor.sh | 13 ++-- tailor.py | 163 ++++++++------------------------------------- test.py | 43 ++++++++---- 7 files changed, 126 insertions(+), 184 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 7cf946f..ab356d3 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -121,8 +121,8 @@ def compress_submodule(name, subname, module, submodule): setattr(module, subname, compressed) # TODO: 根据thresh 选择压缩比例 + param_dict = dict() for name, module in finetuned_model.named_modules(): - if "vision" in name: continue @@ -162,9 +162,32 @@ def compress_submodule(name, subname, module, submodule): mask , coeff = compressed.mask, compressed.coeff delta = (unpack(mask)*2-1) * coeff delta = delta.T + elif args.choice == "svd": + dim = 1024 + + if "mlp" in name: + dim = int(1024 * 1.45) + + U , S , V = decomposition((f - p).clone().detach(),dim=dim) + param_dict[f"{name}.{subname}" + ".base"] = p + param_dict[f"{name}.{subname}" + ".U"] = U.to(p.dtype) + param_dict[f"{name}.{subname}" + ".S"] = S.to(p.dtype) + param_dict[f"{name}.{subname}" + ".V"] = V.to(p.dtype) + # if "llava" in args.finetuned_model.lower(): + # U , S , V = decomposition((f - p).clone().detach(),dim=1024) + # param_dict[f"{name}.{subname}" + ".base"] = p + # param_dict[f"{name}.{subname}" + ".U"] = U.to(p.dtype) + # param_dict[f"{name}.{subname}" + ".S"] = S.to(p.dtype) + # param_dict[f"{name}.{subname}" + ".V"] = V.to(p.dtype) finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) - # import pdb ; pdb.set_trace() + + # if "llava" in args.finetuned_model.lower(): + # torch.save(param_dict, "/home/pingbowen/workspace/delta-compression/saved_model/llava_svd.pt") + if args.choice == "svd": + torch.save(param_dict, args.svd_dict) + + finetuned_model.to(torch.bfloat16) finetuned_model.save_pretrained(save_dir) @@ -231,26 +254,26 @@ def decomposition(masked_input_tensor,dim=None,name=None,attn_outlier=0.1,mlp_ou if dim is not None: U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] - 
if "self_attn" in name: - outlier_U = get_outlier(U[:,64:], percent=attn_outlier) - outlier_V = get_outlier(V[:,64:], percent=attn_outlier) + # if "self_attn" in name: + # outlier_U = get_outlier(U[:,64:], percent=attn_outlier) + # outlier_V = get_outlier(V[:,64:], percent=attn_outlier) - set_zero(U[:,64:], outlier_U) - # import pdb; pdb.set_trace() - set_zero(V[:,64:], outlier_V) + # set_zero(U[:,64:], outlier_U) + # # import pdb; pdb.set_trace() + # set_zero(V[:,64:], outlier_V) - else: - outlier_U = get_outlier(U[:,128:], percent=mlp_outlier) - outlier_V = get_outlier(V[:,128:], percent=mlp_outlier) + # else: + # outlier_U = get_outlier(U[:,128:], percent=mlp_outlier) + # outlier_V = get_outlier(V[:,128:], percent=mlp_outlier) - set_zero(U[:,128:], outlier_U) - set_zero(V[:,128:], outlier_V) + # set_zero(U[:,128:], outlier_U) + # set_zero(V[:,128:], outlier_V) # max_val, min_val, mean_abs_val = round(torch.max(U).item(),4), round(torch.min(U).item(),4), round(torch.mean(torch.abs(U)).item(),4) # print(f"max_val {max_val} pos_min {round(torch.min(outlier[outlier > 0]).item(),4)} mean_abs_val {mean_abs_val} ratio {round(torch.min(outlier[outlier > 0]).item() / mean_abs_val,4)}") # import pdb; pdb.set_trace() - return U, S, V , outlier_U, outlier_V + return U, S, V # , outlier_U, outlier_V def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): base_model = get_model(base_model_name, device) diff --git a/bitdelta/train2.py b/bitdelta/train2.py index bc2f7e5..7073a57 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -27,7 +27,6 @@ finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) -# import pdb;pdb.set_trace() print(f"compressing diff...") compress_diff(base_model, finetuned_model, args.save_dir,args) diff --git a/bitdelta/utils.py b/bitdelta/utils.py index 18ea947..5e59508 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -4,8 +4,10 @@ from transformers import AutoConfig, AutoModelForCausalLM,AutoTokenizer from accelerate import infer_auto_device_map, init_empty_weights import os -from llava.model import * -from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +try: + from llava.model import * + from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +except: pass def load_llava(path,device): tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False) @@ -44,7 +46,7 @@ def load_llava(path,device): def parse_args(): parser = argparse.ArgumentParser(description="BitDelta") - + # # models parser.add_argument( "--finetuned_model", type=str, default="lmsys/vicuna-7b-v1.5-16k" @@ -52,6 +54,7 @@ def parse_args(): parser.add_argument("--base_model", type=str, default="meta-llama/Llama-2-7b-hf") # train params + parser.add_argument("--svd_dict", type=str, default="") parser.add_argument("--dataset_name", type=str, default="c4") parser.add_argument("--subset", type=str, default="en") parser.add_argument("--data_dir", type=str, default="en") @@ -66,7 +69,7 @@ def parse_args(): parser.add_argument("--train", action="store_true") parser.add_argument("--attn_outlier", type=float,default=1e-4) parser.add_argument("--mlp_outlier", type=float,default=1e-4) - parser.add_argument("--choice", type=str,choices=['mix','bit','rank'],default=None) + parser.add_argument("--choice", type=str,choices=['mix','bit','svd'],default=None) # device management 
parser.add_argument("--base_model_device", type=str, default="0") diff --git a/run.sh b/run.sh index 42d5b26..9fe3cbf 100644 --- a/run.sh +++ b/run.sh @@ -1,25 +1,33 @@ -MODEL_SAVE_DIR=./../save/test +MODEL_SAVE_DIR=/home/pingbowen/workspace/delta-compression/save/test mkdir -p $MODEL_SAVE_DIR values=(0.05 0.2 0.4 0.5 0.75) -# for value in ${values[@]} -# do -CUDA_VISIBLE_DEVICES=5,6 python \ +pretrained_model=(/data/public/opensource_models/codellama/codellama-7b-python-hf/ /data/public/opensource_models/meta-llama/Llama-2-7b-hf/) +finetuned_model=(/data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/) +svd_dict=(/home/pingbowen/workspace/delta-compression/saved_model/magicoder_svd.pt /home/pingbowen/workspace/delta-compression/saved_model/llama_chat_svd.pt) +for (( i=0; i<2; i++ )); do + +gpu0=$((2 * i)) +gpu1=$((2 * i + 1)) + +CUDA_VISIBLE_DEVICES="$gpu0,$gpu1" python \ bitdelta/train2.py \ - --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ - --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ + --base_model ${pretrained_model[$i]} \ + --finetuned_model ${finetuned_model[$i]} \ --save_dir $MODEL_SAVE_DIR \ --batch_size 4 \ --num_steps 200 \ --save_full_model True \ --attn_outlier 0.2 \ --mlp_outlier 0.1 \ - --choice bit + --svd_dict ${svd_dict[$i]} \ + --choice svd & # &> test.log -# done - - # /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ +done +wait + # /data/public/opensource_models/codellama/codellama-7b-python-hf/ /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ # /home/pingbowen/models/vicuna-13b-v1.5 , /home/pingbowen/models/Llava-v1.5 # /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ + # /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ diff --git a/run_tailor.sh b/run_tailor.sh index 71b7393..5149cfc 100644 --- a/run_tailor.sh +++ b/run_tailor.sh @@ -1,11 +1,14 @@ -python \ - tailor.py \ - --finetuned_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf \ - --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/7b_chat \ +CUDA_VISIBLE_DEVICES=2,3 python tailor.py \ + --pretrained_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ + --finetuned_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/\ + --dim 128 \ + --scale_factor 1.45 \ + --save_dir /home/pingbowen/save/Llama-2-7b-chat_svd # & - +# /data/public/opensource_models/codellama/codellama-7b-python-hf/ +# /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ # python3 tailor.py \ # --finetuned_model_name /data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf \ # --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/math_lora_7b \ \ No newline at end of file diff --git a/tailor.py b/tailor.py index 270bdaa..74d2d04 100755 --- a/tailor.py +++ b/tailor.py @@ -15,22 +15,26 @@ import re import random import numpy as np -import math parser = argparse.ArgumentParser() -parser.add_argument('--finetuned_model_name', type=str, required=True, help='finetuned model name') -parser.add_argument('--save_dir', type=str, required=True, help='finetuned model name') +parser.add_argument('--pretrained_model_name', type=str, help='pretrained model name') +parser.add_argument('--finetuned_model_name', type=str, help='finetuned model name') +parser.add_argument('--save_dir', type=str, 
help='finetuned model name') +parser.add_argument('--dim', type=int, help='finetuned model name') +parser.add_argument('--scale_factor', type=float, default=1.45, help='finetuned model name') args = parser.parse_args() -pretrained_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" +device = "cuda" if torch.cuda.is_available() else "cpu" -finetuned_model_name = args.finetuned_model_name # /data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf -pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, - device_map="cpu") +pretrained_model_name = args.pretrained_model_name + +finetuned_model_name = args.finetuned_model_name +pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name,torch_dtype=torch.bfloat16).to(device) pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name) -finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name, - device_map="cpu") + +finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name,torch_dtype=torch.bfloat16).to(device) finetuned_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=finetuned_model_name) + def set_random_seed(seed: int = 0): """ set random seed @@ -46,145 +50,30 @@ def set_random_seed(seed: int = 0): torch.backends.cudnn.benchmark = False set_random_seed(seed=0) -# scale_factor = finetuned_model.config.intermediate_size / finetuned_model.config.hidden_size - - -scale_factor = 1.45 -def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list): - """ - get the names of parameters that need to be merged - :param input_param_names: list, names of input parameters - :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded - :return: - """ - param_names_to_merge = [] - for param_name in input_param_names: - exclude = any([re.match(exclude_pattern, param_name) for exclude_pattern in exclude_param_names_regex]) - if not exclude: - param_names_to_merge.append(param_name) - return param_names_to_merge - - -# import pdb -# pdb.set_trace() +scale_factor = args.scale_factor def decomposition(masked_input_tensor,dim): - U , S , V = torch.svd(masked_input_tensor) + U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) U , S , V = U[:, :dim],S[:dim],V[:, :dim] # return torch.mm(U, torch.diag(S)), V.t() - return torch.mm(U, torch.mm(torch.diag(S), V.t())) #return lora_B, lora_A + return U @ torch.diag(S) @ V.t() #return lora_B, lora_A -# dim = 1024 -dim = 128 -# dim = 64 -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -peft_dict = {} -malign_dict = {} -other_dict = {} - -task_vector_param_dict = {} -pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()} -finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} -param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) with torch.no_grad(): - for param_name in param_names_to_merge: - if "self_attn" in param_name or "mlp" in 
param_name: - # import pdb ;pdb.set_trace() - if "mlp" in param_name: - dim = math.ceil(dim * scale_factor) - - delta = decomposition(finetuned_param_dict[param_name] - pretrained_param_dict[param_name],dim=dim) - finetuned_model.get_submodule(param_name.replace(".weight", "")).weight.copy_(pretrained_model.get_submodule(param_name.replace(".weight", "")).weight + delta) - # print(f"name {param_name} data {task_vector_param_dict[param_name]} ") - + for k,v in finetuned_model.state_dict().items(): + dim = args.dim + if ".weight" in k: + if "self_attn" in k or "mlp" in k: + if "mlp" in k: + dim = int(dim * scale_factor) + p = pretrained_model.get_submodule(k.replace(".weight", "")).weight + delta = decomposition(v - p,dim).to(v.dtype) + # import pdb; pdb.set_trace() + finetuned_model.get_submodule(k.replace(".weight", "")).weight.copy_(p + delta) finetuned_model.save_pretrained(save_directory=args.save_dir) finetuned_tokenizer.save_pretrained(save_directory=args.save_dir) -# for param_name, param_value in tqdm(task_vector_param_dict.items()): -# if "self_attn" in param_name or "mlp" in param_name: -# lora_B, lora_A = decomposition(param_value,dim=dim) -# lora_A = lora_A * (dim/16) ###补偿scaling, 以后的alpha可以统一为16 -# peft_key = "base_model.model." + param_name.split(".weight")[0] -# print(peft_key+".lora_A.weight") -# peft_dict[peft_key+".lora_A.weight"] = lora_A.contiguous() -# peft_dict[peft_key+".lora_B.weight"] = lora_B.contiguous() - - -# other_dict = {k: v.to(torch.float16) for k, v in other_dict.items()} - -# other_para_path = "/home/wanghanqing/projects/exp/mAlign_exp/lang_LoRAs/peft_ver/trim_lora/code/other_param" -# torch.save(other_dict, os.path.join(other_para_path, "other.pt")) -# torch.save(other_dict, os.path.join(other_para_path, "pretrain_other.pt")) - - -peft_dict = {k: v.to(torch.float16) for k, v in peft_dict.items()} - -# layernum = 40 -# for lnum in range(layernum): -# peft_pfx = f"base_model.model.model.layers.{lnum}" -# delta_pfx = f"encoder.layers.{lnum}" -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_q_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.q_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_q_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.q_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_k_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.k_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_k_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.k_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_v_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.v_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_v_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.v_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.attention_out_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.o_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.attention_out_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.o_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_0_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.gate_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_0_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.gate_proj.lora_B.weight"].contiguous() -# 
malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_1_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.up_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_1_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.up_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_out_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.down_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_out_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.down_proj.lora_B.weight"].contiguous() - - - - - -malign_dict = {k: v.to(torch.float16) for k, v in malign_dict.items()} - -# import pdb -# pdb.set_trace() - -output_peft_path = "/home/wanghanqing/projects/exp/mAlign_exp/lang_LoRAs/peft_ver/trim_lora/dim256_2/code" -output_malign_path = "/home/wanghanqing/projects/exp/mAlign_exp/mAlign_LoRAs/trim_lora/dim256_2/code" - -# torch.save(peft_dict, os.path.join(output_peft_path, "adapter_model.bin")) -# torch.save(malign_dict, os.path.join(output_malign_path, "lora.pt")) - print("--end--") - - - - - -# num , masked_input_tensor = 0,input_tensor -# if "self_attn" in param_name or "mlp" in param_name: -# if "mlp" in param_name: -# dim = math.ceil(dim * scale_factor) -# thresh_hold = 0.06752 -# num, masked_input_tensor = decomposition(input_tensor,dim=dim) - - - - - -# for param_name, param_value in finetuned_model.named_parameters(): -# if param_name in masked_param_dict: -# param_value.data.copy_(masked_param_dict[param_name]) - -# logger.info(f"saving model at {save_model_path}...") -# os.makedirs(save_model_path, exist_ok=True) -# finetuned_model.save_pretrained(save_directory=save_model_path) -# finetuned_tokenizer.save_pretrained(save_directory=save_model_path) -# logger.info(f"model is saved") \ No newline at end of file diff --git a/test.py b/test.py index ab02ab3..500e631 100644 --- a/test.py +++ b/test.py @@ -133,19 +133,36 @@ def copy_nonzero_values(A, B): A[mask] = B[mask] return A - -# 示例 -n = 4 -A = torch.randn(n, n) # 随机生成一个n × n的张量A -B = torch.zeros(n, n) # 创建一个n × n的全零张量B -A = A.flatten() -values , top_indices = torch.topk(A, 1, largest=True) - - -import pdb; pdb.set_trace() -# params = base_model.state_dict() - -# print(params.keys()) +def load_svd(model): + param_dict = torch.load("/home/pingbowen/workspace/delta-compression/saved_model/llava_svd.pt") + + with torch.no_grad(): + for k,v in param_dict.items(): + if "base" in k: + dim = args.dim + + if "mlp" in k: + dim = int(dim * 1.45) + + k = k.replace(".base", "") + + U = param_dict[k + ".U"][:, :dim] + S = param_dict[k + ".S"][:dim] + V = param_dict[k + ".V"][:, :dim] + # import pdb; pdb.set_trace() + model.get_submodule(k).weight.copy_(v + U @ torch.diag(S) @ V.t()) + +parser = argparse.ArgumentParser(description="BitDelta") +parser.add_argument("--dim", type=int, default=128) +args = parser.parse_args() + +tokenizer = AutoTokenizer.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/") +model = AutoModelForCausalLM.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) + +load_svd(model) + +tokenizer.save_pretrained(f"/home/pingbowen/workspace/delta-compression/save/Llama-chat-svd_{args.dim}/") +model.save_pretrained(f"/home/pingbowen/workspace/delta-compression/save/Llama-chat-svd_{args.dim}/") # get_tokenizer("/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/") # save_full_model("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/", 
"/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/", os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "diff_untrained.pt"), os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "uncalibrated_model"), device="cuda") \ No newline at end of file From 29f7485e56c20ce32be5f59a5924a4bb2e939e14 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Sat, 11 May 2024 10:40:19 +0800 Subject: [PATCH 14/14] support plot et al. --- Plot.py | 28 ++++++++++++++++++++++++++++ bitdelta/diff2.py | 14 ++++++++++---- bitdelta/utils.py | 2 ++ run.sh | 34 +++++++++++++++++++++++----------- run_tailor.sh | 29 +++++++++++++++++++++++------ tailor.py | 36 +++++++++++++++++++++++++----------- test.py | 16 +++++++++------- 7 files changed, 120 insertions(+), 39 deletions(-) create mode 100644 Plot.py diff --git a/Plot.py b/Plot.py new file mode 100644 index 0000000..a6d550d --- /dev/null +++ b/Plot.py @@ -0,0 +1,28 @@ +import matplotlib.pyplot as plt +import numpy as np +import torch +import argparse +def plot_bit_delta(title): + plt.figure(figsize=(10, 5)) + plt.plot(bit_delta, label=f'Bit-Delta {map[args.param_type]}') + plt.plot(svd_delta, label=f'svd Data {map[args.param_type]}') + plt.plot(mix_delta, label=f'Ours {map[args.param_type]}') + plt.title("Comparison of the Cosine Similarity between the Bit-Delta, SVD, and our method with WizardMath-7B-v1.0") + plt.xlabel(f'{map[args.param_type]} of each layer') # X轴标题 + plt.ylabel('Cosine Similarity Value') # Y轴标题 + plt.legend() + plt.savefig(f'./figures/{map[args.param_type]}_cos_sim.pdf') + plt.show() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--param_type', type=str, help='finetuned model name') + map = {"q_proj":"Query_proj", "k_proj":"Key_proj","v_proj":"Value_proj","o_proj":"Output_proj","gate_proj":"Gate_proj","up_proj":"Up_proj","down_proj":"Down_proj"} + args = parser.parse_args() + + bit_delta = torch.load(f'./statistic/{args.param_type}_bitdelta_cos_sim.pt') + svd_delta = torch.load(f'./statistic/{args.param_type}_svd_cos_sim.pt') + mix_delta = torch.load(f'./statistic/{args.param_type}_mix_cos_sim.pt') + + plot_bit_delta('Cosine Similarity of Bit-Delta, svd and mixed Data') \ No newline at end of file diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index ab356d3..1afabdc 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -123,6 +123,7 @@ def compress_submodule(name, subname, module, submodule): # TODO: 根据thresh 选择压缩比例 param_dict = dict() for name, module in finetuned_model.named_modules(): + # import pdb; pdb.set_trace() if "vision" in name: continue @@ -162,11 +163,15 @@ def compress_submodule(name, subname, module, submodule): mask , coeff = compressed.mask, compressed.coeff delta = (unpack(mask)*2-1) * coeff delta = delta.T + + if "llava" in args.finetuned_model.lower(): + param_dict[f"{name}.{subname}" + ".weight"] = p + delta.to(p.dtype) + elif args.choice == "svd": - dim = 1024 + dim = args.dim if "mlp" in name: - dim = int(1024 * 1.45) + dim = int(dim * args.scale_factor) U , S , V = decomposition((f - p).clone().detach(),dim=dim) param_dict[f"{name}.{subname}" + ".base"] = p @@ -182,8 +187,9 @@ def compress_submodule(name, subname, module, submodule): finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) - # if "llava" in args.finetuned_model.lower(): - # torch.save(param_dict, "/home/pingbowen/workspace/delta-compression/saved_model/llava_svd.pt") + if "llava" in args.finetuned_model.lower() and 
args.choice == "bit": + torch.save(param_dict, args.svd_dict) + if args.choice == "svd": torch.save(param_dict, args.svd_dict) diff --git a/bitdelta/utils.py b/bitdelta/utils.py index 5e59508..3fff503 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -61,6 +61,8 @@ def parse_args(): parser.add_argument("--split", type=str, default="train") parser.add_argument("--lr", type=float, default=1e-4) parser.add_argument("--num_steps", type=int, default=100) + parser.add_argument("--dim", type=int, default=1024) + parser.add_argument("--scale_factor", type=float, default=1.45) parser.add_argument("--batch_size", type=int, default=4) parser.add_argument("--layers", nargs='+', default=None) parser.add_argument("--save_num", type=int, default=0) diff --git a/run.sh b/run.sh index 9fe3cbf..37ed843 100644 --- a/run.sh +++ b/run.sh @@ -1,30 +1,42 @@ -MODEL_SAVE_DIR=/home/pingbowen/workspace/delta-compression/save/test +MODEL_SAVE_DIR=/home/pingbowen/workspace/delta-compression/save/mistral-v0.2_bitdelta mkdir -p $MODEL_SAVE_DIR values=(0.05 0.2 0.4 0.5 0.75) -pretrained_model=(/data/public/opensource_models/codellama/codellama-7b-python-hf/ /data/public/opensource_models/meta-llama/Llama-2-7b-hf/) -finetuned_model=(/data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/) -svd_dict=(/home/pingbowen/workspace/delta-compression/saved_model/magicoder_svd.pt /home/pingbowen/workspace/delta-compression/saved_model/llama_chat_svd.pt) +pretrained_model=(/data/public/opensource_models/meta-llama/Llama-2-7b-hf/ ) +finetuned_model=(/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf +) +svd_dict=(/data/groups/QY_LLM_Other/pingbowen/models/codelora/codelora_svd.pt / ) +save_dir=(/home/pingbowen/workspace/delta-compression/save/test /data/groups/QY_LLM_Other/pingbowen/models/codelora/codelora_bitdelta/) + for (( i=0; i<2; i++ )); do +# choice="svd" +if [ $i -eq 0 ]; then + choice="svd" +else + choice="bit" +fi + gpu0=$((2 * i)) gpu1=$((2 * i + 1)) - -CUDA_VISIBLE_DEVICES="$gpu0,$gpu1" python \ +# "$gpu0,$gpu1" +CUDA_VISIBLE_DEVICES=$((i + 1)) python \ bitdelta/train2.py \ - --base_model ${pretrained_model[$i]} \ - --finetuned_model ${finetuned_model[$i]} \ - --save_dir $MODEL_SAVE_DIR \ + --base_model ${pretrained_model[0]} \ + --finetuned_model ${finetuned_model[0]} \ + --save_dir ${save_dir[$i]} \ --batch_size 4 \ --num_steps 200 \ --save_full_model True \ --attn_outlier 0.2 \ --mlp_outlier 0.1 \ --svd_dict ${svd_dict[$i]} \ - --choice svd & - # &> test.log + --dim 1024 \ + --scale_factor 1.46 \ + --choice $choice & + # &> test.log # ${save_dir[$i]} done wait # /data/public/opensource_models/codellama/codellama-7b-python-hf/ /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ diff --git a/run_tailor.sh b/run_tailor.sh index 5149cfc..a3ff799 100644 --- a/run_tailor.sh +++ b/run_tailor.sh @@ -1,9 +1,26 @@ -CUDA_VISIBLE_DEVICES=2,3 python tailor.py \ - --pretrained_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ - --finetuned_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/\ - --dim 128 \ - --scale_factor 1.45 \ - --save_dir /home/pingbowen/save/Llama-2-7b-chat_svd +pretrained_model=(/data/public/opensource_models/meta-llama/Llama-2-7b-hf/ /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ /data/public/opensource_models/codellama/codellama-7b-python-hf/ /home/pingbowen/models/vicuna-7b-v1.5) 
+finetuned_model=(/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ /home/pingbowen/models/llava-v1.5-7b) +finetuned_compressed_model=(/home/pingbowen/workspace/delta-compression/saved_model/WizardMath-7B-V1.0_bitdelta/ /data/groups/QY_LLM_Other/pingbowen/models/wizardmath/WizardMath_svd/ /data/groups/QY_LLM_Other/pingbowen/models/wizardmath/delta_1024_mix_32_8_3_2_full/) +param_types=(q_proj k_proj v_proj o_proj gate_proj up_proj down_proj) +model_types=(svd bitdelta mix) + +for (( j=0; j<${#param_types[@]}; j++ )); do + CUDA_VISIBLE_DEVICES=1 python3 Plot.py --param_type ${param_types[$j]} +done +# for i in {0..2} +# do +# for (( j=0; j<${#param_types[@]}; j++ )); do +# CUDA_VISIBLE_DEVICES=1,7 python tailor.py \ +# --pretrained_model_name ${pretrained_model[0]} \ +# --finetuned_model_name ${finetuned_model[0]} \ +# --finetuned_compressed_model ${finetuned_compressed_model[$i]} \ +# --dim 128 \ +# --scale_factor 1.45 \ +# --param_type ${param_types[$j]} \ +# --model_type ${model_types[$i]} \ +# --save_dir ./statistic/ +# done +# done # & diff --git a/tailor.py b/tailor.py index 74d2d04..d1f7f88 100755 --- a/tailor.py +++ b/tailor.py @@ -15,11 +15,15 @@ import re import random import numpy as np +import torch.nn.functional as F parser = argparse.ArgumentParser() parser.add_argument('--pretrained_model_name', type=str, help='pretrained model name') parser.add_argument('--finetuned_model_name', type=str, help='finetuned model name') +parser.add_argument('--finetuned_compressed_model', type=str, help='finetuned model name') parser.add_argument('--save_dir', type=str, help='finetuned model name') +parser.add_argument('--param_type', type=str, help='finetuned model name') +parser.add_argument('--model_type', type=str, help='finetuned model name') parser.add_argument('--dim', type=int, help='finetuned model name') parser.add_argument('--scale_factor', type=float, default=1.45, help='finetuned model name') args = parser.parse_args() @@ -35,6 +39,7 @@ finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name,torch_dtype=torch.bfloat16).to(device) finetuned_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=finetuned_model_name) +finetuned_compressed_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=args.finetuned_compressed_model,torch_dtype=torch.bfloat16).to(device) def set_random_seed(seed: int = 0): """ set random seed @@ -59,21 +64,30 @@ def decomposition(masked_input_tensor,dim): # return torch.mm(U, torch.diag(S)), V.t() return U @ torch.diag(S) @ V.t() #return lora_B, lora_A +L2_norm_total,L1_norm_total, mag, num = 0, 0 , 0 ,0 + +l2_norm ,cos_sim = [],[] with torch.no_grad(): - for k,v in finetuned_model.state_dict().items(): + for k,v in finetuned_compressed_model.state_dict().items(): dim = args.dim - if ".weight" in k: - if "self_attn" in k or "mlp" in k: - if "mlp" in k: - dim = int(dim * scale_factor) - p = pretrained_model.get_submodule(k.replace(".weight", "")).weight - delta = decomposition(v - p,dim).to(v.dtype) - # import pdb; pdb.set_trace() - finetuned_model.get_submodule(k.replace(".weight", "")).weight.copy_(p + delta) + if args.param_type in k : # or "mlp" in k + # if "mlp" in k: + # dim = int(dim * scale_factor) + p = pretrained_model.get_submodule(k.replace(".weight", "")).weight + f = finetuned_model.get_submodule(k.replace(".weight", "")).weight + 
delta , compressed_delta = f - p, v - p + # l2_norm.append(torch.norm(delta - compressed_delta,2).item()) + cos_sim.append(torch.mean(F.cosine_similarity(delta, compressed_delta, dim=0),dim=0).item()) + # L2_norm_total ,L1_norm_total,cos_sim, mag = L2_norm_total + torch.norm(torch.abs(delta) - torch.abs(compressed_delta),2).data, L1_norm_total + torch.norm(torch.abs(delta) - torch.abs(compressed_delta),1).data, cos_sim + F.cosine_similarity(delta, compressed_delta, dim=0), mag + torch.sum(torch.abs(compressed_delta)).data + # num += 1 -finetuned_model.save_pretrained(save_directory=args.save_dir) -finetuned_tokenizer.save_pretrained(save_directory=args.save_dir) +print("cos_sim:", cos_sim) +torch.save(cos_sim, os.path.join(args.save_dir, f"{args.param_type}_{args.model_type}_cos_sim.pt")) +# print("cos_sim_ave:", cos_sim / num) +# print("mag_ave:", mag_ave) +# finetuned_model.save_pretrained(save_directory=args.save_dir) +# finetuned_tokenizer.save_pretrained(save_directory=args.save_dir) print("--end--") diff --git a/test.py b/test.py index 500e631..55b59a9 100644 --- a/test.py +++ b/test.py @@ -134,15 +134,15 @@ def copy_nonzero_values(A, B): return A def load_svd(model): - param_dict = torch.load("/home/pingbowen/workspace/delta-compression/saved_model/llava_svd.pt") - + param_dict = torch.load(args.svd_dict) + # import pdb; pdb.set_trace() with torch.no_grad(): for k,v in param_dict.items(): if "base" in k: dim = args.dim if "mlp" in k: - dim = int(dim * 1.45) + dim = int(dim * args.scale_factor) k = k.replace(".base", "") @@ -154,15 +154,17 @@ def load_svd(model): parser = argparse.ArgumentParser(description="BitDelta") parser.add_argument("--dim", type=int, default=128) +parser.add_argument("--scale_factor", type=float, default=1.45) +parser.add_argument("--svd_dict", type=str, default="") args = parser.parse_args() -tokenizer = AutoTokenizer.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/") -model = AutoModelForCausalLM.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained("/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf") +model = AutoModelForCausalLM.from_pretrained("/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf", torch_dtype=torch.bfloat16) # low_cpu_mem_usage=True load_svd(model) -tokenizer.save_pretrained(f"/home/pingbowen/workspace/delta-compression/save/Llama-chat-svd_{args.dim}/") -model.save_pretrained(f"/home/pingbowen/workspace/delta-compression/save/Llama-chat-svd_{args.dim}/") +tokenizer.save_pretrained(f"/data/groups/QY_LLM_Other/pingbowen/models/mathlora/math_svd/") +model.save_pretrained(f"/data/groups/QY_LLM_Other/pingbowen/models/mathlora/math_svd/") # get_tokenizer("/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/") # save_full_model("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/", "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/", os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "diff_untrained.pt"), os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "uncalibrated_model"), device="cuda") \ No newline at end of file
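Note for readers skimming the series: the recurring `decomposition` helper (added in bitdelta/diff.py, tailor.py and bitdelta/diff2.py) is just a truncated SVD of the weight delta, computed in float32 and re-applied on top of the base weight. A minimal standalone sketch of that step is below; the helper name, toy shapes and the error check are illustrative only, not part of the patch.

import torch

def truncated_svd_delta(base_w, finetuned_w, dim):
    # Approximate (finetuned - base) with its top-`dim` singular components,
    # as the patch's `decomposition` helper does (SVD is run in float32).
    # torch.svd mirrors the patch; torch.linalg.svd is the newer equivalent.
    delta = (finetuned_w - base_w).to(torch.float32)
    U, S, V = torch.svd(delta)
    U, S, V = U[:, :dim], S[:dim], V[:, :dim]
    return (U @ torch.diag(S) @ V.t()).to(base_w.dtype)

# Toy usage: the real scripts apply this per self_attn/mlp projection weight,
# with dim multiplied by scale_factor (e.g. 1.45) for mlp layers.
base = torch.randn(512, 512)
finetuned = base + 0.01 * torch.randn(512, 512)
compressed = base + truncated_svd_delta(base, finetuned, dim=128)
rel_err = torch.norm(finetuned - compressed) / torch.norm(finetuned - base)
print(f"relative delta error at rank 128: {rel_err.item():.3f}")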
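The "bit" branch in bitdelta/diff2.py instead rebuilds the delta from a packed sign mask and a single scale (`delta = (unpack(mask)*2-1) * coeff`). The following simplified sketch skips the repo's pack/unpack bit-packing and uses the mean absolute delta as the scale, which is an assumption here, not something this patch pins down; function and variable names are illustrative.

import torch
import torch.nn.functional as F

def sign_quantize_delta(base_w, finetuned_w):
    # 1-bit quantization of the delta: keep only the sign of each entry
    # plus one per-matrix scale (assumed: mean absolute delta).
    delta = finetuned_w - base_w
    coeff = delta.float().abs().mean()
    mask = delta >= 0          # in the repo this bool mask is bit-packed
    return mask, coeff

def dequantize_delta(mask, coeff, dtype=torch.float32):
    # Mirror of `(unpack(mask)*2-1) * coeff`: map {0,1} -> {-1,+1}, then rescale.
    return (mask.to(dtype) * 2 - 1) * coeff

base = torch.randn(512, 512)
finetuned = base + 0.01 * torch.randn(512, 512)
mask, coeff = sign_quantize_delta(base, finetuned)
approx = base + dequantize_delta(mask, coeff)
cos = F.cosine_similarity((finetuned - base).flatten(), (approx - base).flatten(), dim=0)
print(f"cosine similarity of true vs. 1-bit delta: {cos.item():.3f}")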
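Finally, the cos_sim lists that run_tailor.sh dumps into ./statistic/ and Plot.py plots come from comparing, per layer and per projection type, the true delta against the delta recovered from a compressed model. A compact sketch of that measurement, working on state dicts rather than get_submodule as the patched tailor.py does (a simplification for illustration):

import torch
import torch.nn.functional as F

def layerwise_cos_sim(pretrained_sd, finetuned_sd, compressed_sd, param_type):
    # For each layer's `param_type` weight, compare the true delta with the
    # compressed model's delta: column-wise cosine similarity, then the mean.
    sims = []
    for k, w in finetuned_sd.items():
        if param_type in k and k.endswith(".weight"):
            delta = w - pretrained_sd[k]
            compressed_delta = compressed_sd[k] - pretrained_sd[k]
            sims.append(F.cosine_similarity(delta, compressed_delta, dim=0).mean().item())
    return sims  # one value per layer, e.g. saved as {param_type}_{model_type}_cos_sim.pt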