From 3a3c7688825736159cab6e554c5b3ab2a6b00ff5 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Mon, 4 Mar 2024 18:48:20 +0800 Subject: [PATCH 01/14] lowrank tailor --- bitdelta/diff.py | 61 ++++++++++- bitdelta/diff.py.rej | 38 +++++++ bitdelta/train.py | 57 ++++++----- bitdelta/train2.py | 37 +++++++ bitdelta/utils.py | 5 +- eval.py | 30 ++++++ lowbit_lowrank.py | 23 +++++ run.sh | 15 +++ scripts/ppl_eval_example.bash | 6 +- tailor.py | 186 ++++++++++++++++++++++++++++++++++ test.py | 110 ++++++++++++++++++++ 11 files changed, 532 insertions(+), 36 deletions(-) create mode 100644 bitdelta/diff.py.rej create mode 100644 bitdelta/train2.py create mode 100644 eval.py create mode 100644 lowbit_lowrank.py create mode 100644 run.sh create mode 100755 tailor.py create mode 100644 test.py diff --git a/bitdelta/diff.py b/bitdelta/diff.py index c2b03ce..faa1bbb 100644 --- a/bitdelta/diff.py +++ b/bitdelta/diff.py @@ -9,12 +9,13 @@ class BinaryDiff(nn.Module): def __init__(self, base, finetune): super().__init__() diff = finetune - base + # diff = decomposition(diff, 2048) quantile = diff.float().abs().mean() mask = torch.ones_like(diff) mask[diff < 0] = 0 mask = pack(mask.bool().T) - + self.register_buffer("mask", mask) self.register_buffer("base", base.T) self.register_parameter( @@ -38,7 +39,15 @@ def forward(self, x): repeated_mask = self.mask.unsqueeze(0).repeat(x.size(0), 1, 1) return x @ self.base + self.coeff * binary_bmm(x, repeated_mask) -def compress_diff(base_model, finetuned_model, finetuned_compressed_model): +def Pass(layers=None,name=None): + if layers is not None: + for layer in layers: + if layer in name: + return True + return False + + +def compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=None): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -59,11 +68,15 @@ def compress_submodule(name, subname, module, submodule): # TODO: this can be parallelized for name, module in finetuned_compressed_model.named_modules(): if "mlp" in name or "self_attn" in name: + + if Pass(layers,name) == True: + continue + for subname, submodule in module.named_children(): if "proj" in subname: compress_submodule(name, subname, module, submodule) -def save_diff(finetuned_compressed_model, save_dir): +def save_diff(finetuned_compressed_model, save_dir,layers=None): diff_dict = {} for name, module in finetuned_compressed_model.named_modules(): @@ -91,6 +104,9 @@ def load_diff(model, diff_dir): # setattr(module, "mask", mask) # setattr(module, "coeff", coeff) weight = (unpack(mask)*2-1) * coeff + + if "mlp" in name: + weight = decomposition(weight, 1024) module.weight.add_(weight.T.to(module.weight.dtype)) elif name + ".weight" in diff_dict: @@ -105,11 +121,46 @@ def load_diff(model, diff_dir): model.config.vocab_size = model.lm_head.weight.size(0) -def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device): +def decomposition(masked_input_tensor,dim): + # if "mlp" in name: + # dim = int(dim * 1.45) + + U , S , V = torch.svd(masked_input_tensor) + # total_sum , partial_sum = torch.sum(S) , torch.sum(S[:128]) + # import pdb; pdb.set_trace() + U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] + return torch.mm(torch.mm(U, torch.diag(S)), V.t()) + +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None): base_model = get_model(base_model_name, device) tokenizer = get_tokenizer(finetuned_model_name) + + finetuned_model = get_model(finetuned_model_name, device) + # params 
= {} + + # for k ,v in finetuned_model.named_parameters(): + # if layers is not None: + # for layer in layers: + # if layer in k: + # if "mlp" in k or "self_attn" in k: + # delta = v.detach().cpu() - base_model.get_submodule(k.replace('.weight',"")).weight.detach().cpu() + # dim = 128 + # if "mlp" in k: + # dim = int(dim * 1.45) + # # import pdb; pdb.set_trace() + # params[k] = decomposition(delta.to(torch.float32), dim).to(torch.bfloat16) + + # import pdb; pdb.set_trace() + # dict(base_model.named_parameters())['model.layers.0.self_attn.o_proj.weight'] + + # with torch.no_grad(): + # for param in params: + # base_model.get_submodule(param.replace('.weight',"")).weight.add_(params[param].detach().to(device)) + + # import pdb; pdb.set_trace() load_diff(base_model, diff_dir) - + + base_model.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir) diff --git a/bitdelta/diff.py.rej b/bitdelta/diff.py.rej new file mode 100644 index 0000000..5f60f5f --- /dev/null +++ b/bitdelta/diff.py.rej @@ -0,0 +1,38 @@ +diff a/bitdelta/diff.py b/bitdelta/diff.py (rejected hunks) +@@ -73,24 +86,31 @@ def save_diff(finetuned_compressed_model, save_dir): + diff_dict[name + ".coeff"] = module.coeff.cpu() + + for name, param in finetuned_compressed_model.named_parameters(): ++ if "mlp" in name or "self_attn" in name: ++ if Pass(layers,name) == True: ++ continue ++ + if param.requires_grad: + diff_dict[name] = param.cpu() +- ++ ++ # import pdb; pdb.set_trace() + torch.save(diff_dict, save_dir) + + @torch.no_grad() + def load_diff(model, diff_dir): + device = model.device + diff_dict = torch.load(diff_dir) +- ++ + for name, module in model.named_modules(): + if name + ".mask" in diff_dict: + coeff = diff_dict[name + ".coeff"].to(device) + mask = diff_dict[name + ".mask"].to(device) + +- setattr(module, "mask", mask) +- setattr(module, "coeff", coeff) +- # module.weight.add_((mask * coeff).to(module.weight.dtype)) ++ # setattr(module, "mask", mask) ++ # setattr(module, "coeff", coeff) ++ weight = (unpack(mask)*2-1) * coeff ++ ++ module.weight.add_(weight.T.to(module.weight.dtype)) + elif name + ".weight" in diff_dict: + module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + diff --git a/bitdelta/train.py b/bitdelta/train.py index 946dafb..6ab3825 100644 --- a/bitdelta/train.py +++ b/bitdelta/train.py @@ -37,7 +37,7 @@ finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) print(f"compressing diff...") -compress_diff(base_model, finetuned_model, finetuned_compressed_model) +compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=args.layers) train_num_samples = args.batch_size * args.num_steps train_dataset = get_dataset( @@ -55,37 +55,38 @@ ) # save untrained delta -save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt")) +save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt"),layers=args.layers) -optimizer = torch.optim.AdamW(finetuned_compressed_model.parameters(), lr=args.lr) -scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_steps) +if args.train: + optimizer = torch.optim.AdamW(finetuned_compressed_model.parameters(), lr=args.lr) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_steps) -bar = tqdm(train_dataloader) + bar = tqdm(train_dataloader) -train_loss_list = [] + train_loss_list = [] -# Train loop -for step, batch in enumerate(bar): - 
batch1 = {k: v.to(finetuned_model.device) for k, v in batch.items()} - with torch.inference_mode(): - finetuned_outputs = finetuned_model(**batch1) + # Train loop + for step, batch in enumerate(bar): + batch1 = {k: v.to(finetuned_model.device) for k, v in batch.items()} + with torch.inference_mode(): + finetuned_outputs = finetuned_model(**batch1) - batch2 = {k: v.to(finetuned_compressed_model.device) for k, v in batch.items()} - finetuned_compressed_outputs = finetuned_compressed_model(**batch2) + batch2 = {k: v.to(finetuned_compressed_model.device) for k, v in batch.items()} + finetuned_compressed_outputs = finetuned_compressed_model(**batch2) - loss = F.mse_loss( - finetuned_outputs.logits.clone().to(finetuned_compressed_outputs.logits.device), - finetuned_compressed_outputs.logits, - ) + loss = F.mse_loss( + finetuned_outputs.logits.clone().to(finetuned_compressed_outputs.logits.device), + finetuned_compressed_outputs.logits, + ) - train_loss_list.append(loss.item()) + train_loss_list.append(loss.item()) - optimizer.zero_grad() - loss.backward() - optimizer.step() - scheduler.step() + optimizer.zero_grad() + loss.backward() + optimizer.step() + scheduler.step() - bar.set_description(f"train loss: {loss.item()}") + bar.set_description(f"train loss: {loss.item()}") # save loss list @@ -93,14 +94,14 @@ with open(os.path.join(args.save_dir, f"train_loss_{args.num_groups}.json"), "w") as f: json.dump(train_loss_list, f) -# save trained delta -save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff.pt")) +# # save trained delta +save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff.pt"),layers=args.layers) del base_model, finetuned_model, finetuned_compressed_model torch.cuda.empty_cache() if args.save_full_model: print("saving uncalibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, "uncalibrated_model"), device="cpu") - print("saving calibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") + save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, f"uncalibrated_model"), device="cpu",layers=args.layers) + # print("saving calibrated model") + # save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") diff --git a/bitdelta/train2.py b/bitdelta/train2.py new file mode 100644 index 0000000..37c9c70 --- /dev/null +++ b/bitdelta/train2.py @@ -0,0 +1,37 @@ +import os + +import torch + +import torch.nn.functional as F +from bitdelta.diff import compress_diff, save_diff, save_full_model +from bitdelta.misc import find_corr_stddev + +from bitdelta.utils import get_model, parse_args, get_tokenizer +from tqdm import tqdm +from bitdelta.data import get_dataset, get_dataloader + +import json + +args = parse_args() + +# create save_dir if it doesn't exist +os.makedirs(args.save_dir, exist_ok=True) + +tokenizer = get_tokenizer(args.base_model) + +with torch.no_grad(): + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + +finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, 
args.finetuned_compressed_model_memory_map) + +print(f"compressing diff...") +compress_diff(base_model, finetuned_model, finetuned_compressed_model) + +# save untrained delta +save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt")) + + +if args.save_full_model: + print("saving uncalibrated model") + save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, "uncalibrated_model"), device="cpu") diff --git a/bitdelta/utils.py b/bitdelta/utils.py index a7c55ea..1304239 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -21,9 +21,11 @@ def parse_args(): parser.add_argument("--lr", type=float, default=1e-4) parser.add_argument("--num_steps", type=int, default=100) parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument("--layers", nargs='+', default=None) + parser.add_argument("--save_num", type=int, default=0) parser.add_argument("--max_length", type=int, default=128) parser.add_argument("--save_dir", type=str, required=True) - + parser.add_argument("--train", action="store_true") # device management parser.add_argument("--base_model_device", type=str, default="0") @@ -102,6 +104,7 @@ def get_model(model_name, device, memory_map=None): else: # single-gpu or cpu return transformers.AutoModelForCausalLM.from_pretrained( model_name, + # torch_dtype=torch.float16, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, ).to(device) diff --git a/eval.py b/eval.py new file mode 100644 index 0000000..5e813b1 --- /dev/null +++ b/eval.py @@ -0,0 +1,30 @@ +import argparse +import transformers +import torch +from transformers import AutoConfig, AutoModelForCausalLM + +def load_model(model_name): + model = transformers.AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True,) + return model + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--base_model', type=str) + parser.add_argument('--finetuned_model', type=str) + args = parser.parse_args() + + base_model = load_model(args.base_model) + finetuned_model = load_model(args.finetuned_model) + + params = dict() + + for n,p in finetuned_model.named_parameters(): + if "mlp" in n or "self_attn" in n: + delta = p - base_model.state_dict()[n] + w = torch.sum(torch.abs(delta)) + params[n] = w.item() + + print(params) \ No newline at end of file diff --git a/lowbit_lowrank.py b/lowbit_lowrank.py new file mode 100644 index 0000000..4f159ff --- /dev/null +++ b/lowbit_lowrank.py @@ -0,0 +1,23 @@ +import os + +import torch + +import torch.nn.functional as F +from bitdelta.diff import compress_diff, save_diff, save_full_model +from bitdelta.misc import find_corr_stddev + +from bitdelta.utils import get_model, parse_args, get_tokenizer +from tqdm import tqdm + +args = parse_args() + +tokenizer = get_tokenizer(args.base_model) + +with torch.no_grad(): + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + +finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) + +print(f"compressing diff...") +compress_diff(base_model, finetuned_model, finetuned_compressed_model) diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..ef3b529 --- /dev/null +++ b/run.sh @@ -0,0 +1,15 @@ +MODEL_SAVE_DIR=save/ + +mkdir -p 
$MODEL_SAVE_DIR + +CUDA_VISIBLE_DEVICES=6,7 python \ + bitdelta/train.py \ + --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ + --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ + --save_dir $MODEL_SAVE_DIR \ + --batch_size 4 \ + --num_steps 200 \ + --save_full_model True + + # --layers "layers.5."\ + # /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ diff --git a/scripts/ppl_eval_example.bash b/scripts/ppl_eval_example.bash index ee6cc6a..ba45351 100644 --- a/scripts/ppl_eval_example.bash +++ b/scripts/ppl_eval_example.bash @@ -1,8 +1,10 @@ +PPL_SAVE_DIR=save + CUDA_VISIBLE_DEVICES=0 python \ bitdelta/eval_ppl.py \ - --base_model meta-llama/Llama-2-7b-hf \ + --base_model /home/pingbowen/workspace/delta-compression/BitDelta/save/calibrated_model \ --dataset_name wikitext \ --subset wikitext-2-raw-v1 \ --save_dir $PPL_SAVE_DIR \ --num_eval_samples 100 \ - --model_diff $MODEL_SAVE_DIR/diff.pt \ \ No newline at end of file + # --model_diff $MODEL_SAVE_DIR/diff.pt \ \ No newline at end of file diff --git a/tailor.py b/tailor.py new file mode 100755 index 0000000..cf3143e --- /dev/null +++ b/tailor.py @@ -0,0 +1,186 @@ +import argparse +import jsonlines +import sys +import shutil +import logging +import os +import time +from tqdm import tqdm +import glob +import json +import torch +import datasets +from transformers import AutoTokenizer, AutoModelForCausalLM +# from vllm import LLM, SamplingParams +import re +import random +import numpy as np + +pretrained_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" + +finetuned_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf" # /data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf + +pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, + device_map="cpu") +pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name) +finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name, + device_map="cpu") +finetuned_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=finetuned_model_name) + +save_dir = "/home/pingbowen/workspace/delta-compression/BitDelta/save/uncalibrated_model" + +def set_random_seed(seed: int = 0): + """ + set random seed + :param seed: int, random seed + :return: + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + +set_random_seed(seed=0) +# scale_factor = finetuned_model.config.intermediate_size / finetuned_model.config.hidden_size + + +def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list): + """ + get the names of parameters that need to be merged + :param input_param_names: list, names of input parameters + :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded + :return: + """ + param_names_to_merge = [] + for param_name in input_param_names: + exclude = any([re.match(exclude_pattern, param_name) for exclude_pattern in exclude_param_names_regex]) + if not exclude: + param_names_to_merge.append(param_name) + return param_names_to_merge + + + +task_vector_param_dict = {} +pretrained_param_dict = {param_name: param_value for param_name, param_value in 
pretrained_model.named_parameters()} +finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} +# param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) +# with torch.no_grad(): +# for param_name in finetuned_param_dict.keys(): +# task_vector_param_dict[param_name] = finetuned_param_dict[param_name] - pretrained_param_dict[param_name] +# print(f"name {param_name} data {task_vector_param_dict[param_name]} ") + + +# import pdb +# pdb.set_trace() + +def decomposition(masked_input_tensor,dim): + + U , S , V = torch.svd(masked_input_tensor) + U , S , V = U[:, :dim],S[:dim],V[:, :dim] + # return torch.mm(U, torch.diag(S)), V.t() + # return U, torch.mm(torch.diag(S), V.t()) #return lora_B, lora_A + return torch.mm(torch.mm(U, torch.diag(S)), V.t()) + +# dim = 256 +dim = 128 +# dim = 16 +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) +print("----------------------dim: ",dim) + +peft_dict = {} +malign_dict = {} +other_dict = {} + +# finetuned_param_dict +# for param_name, param_value in tqdm(pretrained_param_dict.items()): +# if "self_attn" in param_name or "mlp" in param_name: +# pass +# else: +# other_dict[param_name] = param_value.contiguous() + +diff = dict() + +for param_name, param_value in tqdm(finetuned_param_dict.items()): + if "self_attn" in param_name or "mlp" in param_name: + delta = param_value - pretrained_param_dict[param_name] + if "mlp" in param_name: + dim = int(dim * 1.45) + delta = decomposition(delta,dim=dim) + diff[param_name] = (pretrained_param_dict[param_name] + delta).contiguous() + else: + diff[param_name] = param_value.contiguous() + # lora_A = lora_A * (dim/16) ###补偿scaling, 以后的alpha可以统一为16 + # peft_key = "base_model.model." 
+ param_name.split(".weight")[0] + # print(peft_key+".lora_A.weight") + # peft_dict[peft_key+".lora_A.weight"] = lora_A.contiguous() + # peft_dict[peft_key+".lora_B.weight"] = lora_B.contiguous() + +for n,p in pretrained_model.named_parameters(): + p.data.copy_(diff[n]) + +pretrained_model.save_pretrained(save_dir) +finetuned_tokenizer.save_pretrained(save_dir) + +# other_dict = {k: v.to(torch.float16) for k, v in other_dict.items()} + +# other_para_path = "/home/wanghanqing/projects/exp/mAlign_exp/lang_LoRAs/peft_ver/trim_lora/code/other_param" +# torch.save(other_dict, os.path.join(other_para_path, "other.pt")) +# torch.save(other_dict, os.path.join(other_para_path, "pretrain_other.pt")) + + +# peft_dict = {k: v.to(torch.float16) for k, v in peft_dict.items()} + +# layernum = 40 +# for lnum in range(layernum): +# peft_pfx = f"base_model.model.model.layers.{lnum}" +# delta_pfx = f"encoder.layers.{lnum}" +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_q_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.q_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_q_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.q_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_k_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.k_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_k_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.k_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_v_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.v_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.project_v_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.v_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.attention_out_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.o_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.self_att.self_attention.attention_out_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.o_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_0_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.gate_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_0_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.gate_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_1_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.up_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_1_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.up_proj.lora_B.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_out_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.down_proj.lora_A.weight"].contiguous() +# malign_dict[f"{delta_pfx}.ffn.ffn.w_out_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.down_proj.lora_B.weight"].contiguous() + + + + + +# malign_dict = {k: v.to(torch.float16) for k, v in malign_dict.items()} + +# import pdb +# pdb.set_trace() + +output_peft_path = "/home/wanghanqing/projects/exp/mAlign_exp/lang_LoRAs/peft_ver/trim_lora/dim256_2/code" +output_malign_path = "/home/wanghanqing/projects/exp/mAlign_exp/mAlign_LoRAs/trim_lora/dim256_2/code" + +# torch.save(peft_dict, os.path.join(output_peft_path, "adapter_model.bin")) +# torch.save(malign_dict, os.path.join(output_malign_path, "lora.pt")) + + +print("--end--") + + +# for param_name, param_value in finetuned_model.named_parameters(): +# if 
param_name in masked_param_dict: +# param_value.data.copy_(masked_param_dict[param_name]) + +# logger.info(f"saving model at {save_model_path}...") +# os.makedirs(save_model_path, exist_ok=True) +# finetuned_model.save_pretrained(save_directory=save_model_path) +# finetuned_tokenizer.save_pretrained(save_directory=save_model_path) +# logger.info(f"model is saved") \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..efdb2bf --- /dev/null +++ b/test.py @@ -0,0 +1,110 @@ +import argparse +import transformers +import torch +from transformers import AutoConfig, AutoModelForCausalLM +from accelerate import infer_auto_device_map, init_empty_weights +import torch.nn as nn +import os +from llava.model.language_model.llava_llama import LlavaConfig +from transformers import AutoTokenizer, AutoModelForCausalLM +from llava.model import * + +def get_tokenizer(tokenizer_name): + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, use_fast=False, + ) + + if tokenizer.pad_token_id is None: + if tokenizer.eos_token_id is not None: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + tokenizer.pad_token_id = 0 + + return tokenizer + +@torch.no_grad() +def load_diff(model, diff_dir): + device = model.device + diff_dict = torch.load(diff_dir) + + for name, module in model.named_modules(): + if name + ".mask" in diff_dict: + coeff = diff_dict[name + ".coeff"].to(device) + mask = diff_dict[name + ".mask"].to(device) + + setattr(module, "mask", mask) + setattr(module, "coeff", coeff) + # module.weight.add_((mask * coeff).to(module.weight.dtype)) + elif name + ".weight" in diff_dict: + module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + + elif name + '.A' in diff_dict: + A = diff_dict[name + '.A'].to(device) + B = diff_dict[name + '.B'].to(device) + + mask = (A @ B).T + module.weight.add_(mask.to(module.weight.dtype)) + + model.config.vocab_size = model.lm_head.weight.size(0) + + +def get_model(model_name, device, memory_map=None): + # multi-gpu + if device == "auto" or isinstance(device, list): + + # if gpus are specified, distributes according to the memory map + if isinstance(device, list): + assert memory_map is not None, "memory_map must be specified when using multiple gpus" + config = AutoConfig.from_pretrained(model_name) + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + + device_map = infer_auto_device_map(model, memory_map, no_split_module_classes=["LlamaDecoderLayer"]) + + else: + # use all available gpus + device_map = "auto" + + return transformers.AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + device_map=device_map, + ) + else: # single-gpu or cpu + return transformers.AutoModelForCausalLM.from_pretrained( + model_name, + # torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + ) + + +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device): + base_model = get_model(base_model_name, device) + tokenizer = get_tokenizer(finetuned_model_name) + load_diff(base_model, diff_dir) + + base_model.save_pretrained(save_dir) + tokenizer.save_pretrained(save_dir) + + del base_model + +model_path = "/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/" + +lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path) +tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) +print('Loading LLaVA from base model...') +model = LlavaLlamaForCausalLM.from_pretrained(model_base, 
low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) +token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features +if model.lm_head.weight.shape[0] != token_num: + model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) + + + +# base_model = get_model("/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/", "cuda") +# params = base_model.state_dict() + +# print(params.keys()) + +# get_tokenizer("/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/") +# save_full_model("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/", "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/", os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "diff_untrained.pt"), os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "uncalibrated_model"), device="cuda") \ No newline at end of file From 86d443d17d33a221a413aade0e5f1cafa0d77e0b Mon Sep 17 00:00:00 2001 From: pingbowen Date: Tue, 12 Mar 2024 15:07:43 +0800 Subject: [PATCH 02/14] finish fp16 + 1bit, check diff2.py train2.py --- bitdelta/diff.py | 62 +++++++++------- bitdelta/diff2.py | 173 +++++++++++++++++++++++++++++++++++++++++++++ bitdelta/train.py | 11 ++- bitdelta/train2.py | 14 ++-- run.sh | 4 +- tailor.py | 108 ++++++++++++++-------------- test.py | 25 ++++--- 7 files changed, 296 insertions(+), 101 deletions(-) create mode 100644 bitdelta/diff2.py diff --git a/bitdelta/diff.py b/bitdelta/diff.py index faa1bbb..2f97ab7 100644 --- a/bitdelta/diff.py +++ b/bitdelta/diff.py @@ -9,7 +9,7 @@ class BinaryDiff(nn.Module): def __init__(self, base, finetune): super().__init__() diff = finetune - base - # diff = decomposition(diff, 2048) + diff = decomposition(diff, st=64, ed=1024) quantile = diff.float().abs().mean() mask = torch.ones_like(diff) @@ -66,17 +66,28 @@ def compress_submodule(name, subname, module, submodule): setattr(module, subname, compressed) # TODO: this can be parallelized + # flag = False for name, module in finetuned_compressed_model.named_modules(): - if "mlp" in name or "self_attn" in name: - - if Pass(layers,name) == True: - continue - + # if flag == True: + # break + + if "self_attn" in name: for subname, submodule in module.named_children(): if "proj" in subname: compress_submodule(name, subname, module, submodule) - -def save_diff(finetuned_compressed_model, save_dir,layers=None): + elif "mlp" in name: + with torch.no_grad(): + for subname, submodule in module.named_children(): + if "proj" in subname: + base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + delta = decomposition(finetuned_weight - base_weight,dim=int(128 * 1.45)) + finetuned_compressed_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) + # flag = True + # import pdb; pdb.set_trace() + # break + +def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): diff_dict = {} for name, module in finetuned_compressed_model.named_modules(): @@ -92,9 +103,10 @@ def save_diff(finetuned_compressed_model, save_dir,layers=None): torch.save(diff_dict, save_dir) @torch.no_grad() -def load_diff(model, diff_dir): +def load_diff(model, diff_dir,ori_diff): device = model.device diff_dict = 
torch.load(diff_dir) + # ori_diff = torch.load(ori_diff) for name, module in model.named_modules(): if name + ".mask" in diff_dict: @@ -104,13 +116,15 @@ def load_diff(model, diff_dir): # setattr(module, "mask", mask) # setattr(module, "coeff", coeff) weight = (unpack(mask)*2-1) * coeff - - if "mlp" in name: - weight = decomposition(weight, 1024) + weight_fp16 = decomposition(ori_diff[name + ".weight"].to(torch.float32), dim=64).to(torch.bfloat16) + # import pdb; pdb.set_trace() - module.weight.add_(weight.T.to(module.weight.dtype)) + module.weight.add_(weight_fp16.to(module.weight.dtype) + weight.T.to(module.weight.dtype)) elif name + ".weight" in diff_dict: module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + + # if "mlp" in name: + # import pdb; pdb.set_trace() elif name + '.A' in diff_dict: A = diff_dict[name + '.A'].to(device) @@ -121,17 +135,18 @@ def load_diff(model, diff_dir): model.config.vocab_size = model.lm_head.weight.size(0) -def decomposition(masked_input_tensor,dim): - # if "mlp" in name: - # dim = int(dim * 1.45) +def decomposition(masked_input_tensor,dim=None,st=None,ed=None): + U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) + + if dim is not None: + U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] + + if st is not None and ed is not None: + U , S , V = U[:, st:ed],S[st:ed] ,V[:, st:ed] - U , S , V = torch.svd(masked_input_tensor) - # total_sum , partial_sum = torch.sum(S) , torch.sum(S[:128]) - # import pdb; pdb.set_trace() - U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] return torch.mm(torch.mm(U, torch.diag(S)), V.t()) -def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None): +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): base_model = get_model(base_model_name, device) tokenizer = get_tokenizer(finetuned_model_name) @@ -150,17 +165,14 @@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d # # import pdb; pdb.set_trace() # params[k] = decomposition(delta.to(torch.float32), dim).to(torch.bfloat16) - # import pdb; pdb.set_trace() # dict(base_model.named_parameters())['model.layers.0.self_attn.o_proj.weight'] # with torch.no_grad(): # for param in params: # base_model.get_submodule(param.replace('.weight',"")).weight.add_(params[param].detach().to(device)) - # import pdb; pdb.set_trace() - load_diff(base_model, diff_dir) + load_diff(base_model, diff_dir,ori_diff=ori_diff) - base_model.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py new file mode 100644 index 0000000..30c986c --- /dev/null +++ b/bitdelta/diff2.py @@ -0,0 +1,173 @@ +import torch +import torch.nn as nn +import gc + +from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm +from bitdelta.utils import get_model, get_tokenizer + +class BinaryDiff(nn.Module): + def __init__(self, weight): + super().__init__() + diff = weight + quantile = diff.float().abs().mean() + + mask = torch.ones_like(diff) + mask[diff < 0] = 0 + mask = pack(mask.bool().T) + + self.register_buffer("mask", mask) + # self.register_buffer("base", base.T) + self.register_parameter( + "coeff", + nn.Parameter( + torch.tensor( + quantile, + dtype=torch.float32, + requires_grad=True, + device=weight.device, + ) + ), + ) + # del base, finetune, diff + + def forward(self, x): + # print(x.shape, self.base.shape, self.coeff.shape, self.mask.shape) + # [B, seq, in] @ [in, out] + [B, seq, in] @ [B, 
in/32, out] + + # TODO: This can be faster + repeated_mask = self.mask.unsqueeze(0).repeat(x.size(0), 1, 1) + return x @ self.base + self.coeff * binary_bmm(x, repeated_mask) + +def Pass(layers=None,name=None): + if layers is not None: + for layer in layers: + if layer in name: + return True + return False + + +def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): + def compress_submodule(name, subname, module, submodule): + target_device = submodule.weight.device + + base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(target_device) + finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(target_device) + + compressed = BinaryDiff( + base=base_weight, + finetune=finetuned_weight, + ).to(target_device) + + del submodule, base_weight + setattr(module, subname, None) + gc.collect() + torch.cuda.empty_cache() + setattr(module, subname, compressed) + + # TODO: this can be parallelized + for name, module in finetuned_compressed_model.named_modules(): + + if "self_attn" in name: + for subname, submodule in module.named_children(): + if "proj" in subname: + base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + # compress_submodule(name, subname, module, submodule) + U,S,V = decomposition(finetuned_weight - base_weight,dim=1024) + + compressed_U, compressed_V = BinaryDiff(weight=U[:,64:]).to(finetuned_weight.device), BinaryDiff(weight=V[:,64:]).to(finetuned_weight.device) + U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff + weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff + # import pdb; pdb.set_trace() + U[:,64:] , V[:,64:] = weight_U.T, weight_V.T # 不确定是否有bug + delta = U @ torch.diag(S) @ V.t() + with torch.no_grad(): + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) + + + elif "mlp" in name: + with torch.no_grad(): + for subname, submodule in module.named_children(): + if "proj" in subname: + base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + U,S,V = decomposition(finetuned_weight - base_weight,dim=int(128 * 1.45)) + delta = torch.mm(torch.mm(U, torch.diag(S)), V.t()) + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) + + + finetuned_model.save_pretrained(save_dir) + +def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): + diff_dict = {} + + for name, module in finetuned_compressed_model.named_modules(): + if isinstance(module, BinaryDiff): + # diff_dict[name + ".mask"] = (module.mask == 1).bool().cpu() + diff_dict[name + ".mask"] = module.mask.cpu() + diff_dict[name + ".coeff"] = module.coeff.cpu() + + for name, param in finetuned_compressed_model.named_parameters(): + if param.requires_grad: + diff_dict[name] = param.cpu() + + torch.save(diff_dict, save_dir) + +@torch.no_grad() +def load_diff(model, diff_dir,ori_diff): + device = model.device + diff_dict = torch.load(diff_dir) + # ori_diff = torch.load(ori_diff) + + for name, module in model.named_modules(): + if name + ".mask" in diff_dict: + coeff = diff_dict[name + 
".coeff"].to(device) + mask = diff_dict[name + ".mask"].to(device) + + # setattr(module, "mask", mask) + # setattr(module, "coeff", coeff) + weight = (unpack(mask)*2-1) * coeff + weight_fp16 = decomposition(ori_diff[name + ".weight"].to(torch.float32), dim=64).to(torch.bfloat16) + # import pdb; pdb.set_trace() + + module.weight.add_(weight_fp16.to(module.weight.dtype) + weight.T.to(module.weight.dtype)) + elif name + ".weight" in diff_dict: + module.weight = nn.Parameter(diff_dict[name + ".weight"].to(device).to(module.weight.dtype)) + + # if "mlp" in name: + # import pdb; pdb.set_trace() + + elif name + '.A' in diff_dict: + A = diff_dict[name + '.A'].to(device) + B = diff_dict[name + '.B'].to(device) + + mask = (A @ B).T + module.weight.add_(mask.to(module.weight.dtype)) + + model.config.vocab_size = model.lm_head.weight.size(0) + +def decomposition(masked_input_tensor,dim=None,st=None,ed=None,name=None): + U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) + + if dim is not None: + U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] + + if st is not None and ed is not None: + U , S , V = U[:, st:ed],S[st:ed] ,V[:, st:ed] + + return U, S, V + +def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): + base_model = get_model(base_model_name, device) + tokenizer = get_tokenizer(finetuned_model_name) + + finetuned_model = get_model(finetuned_model_name, device) + # params = {} + + load_diff(base_model, diff_dir,ori_diff=ori_diff) + + base_model.save_pretrained(save_dir) + tokenizer.save_pretrained(save_dir) + + del base_model + diff --git a/bitdelta/train.py b/bitdelta/train.py index 6ab3825..9e4bf97 100644 --- a/bitdelta/train.py +++ b/bitdelta/train.py @@ -23,6 +23,13 @@ base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) +def original_diff(base_model, finetuned_model): + origin_diff = {} + for k, v in finetuned_model.named_parameters(): + if "mlp" in k or "self_attn" in k: + origin_diff[k] = v.detach().cpu() - base_model.get_submodule(k.replace('.weight',"")).weight.detach().cpu() + return origin_diff + # get corr/stddev stats if args.debug: print(f"finding corr/stddev stats...") @@ -94,6 +101,8 @@ with open(os.path.join(args.save_dir, f"train_loss_{args.num_groups}.json"), "w") as f: json.dump(train_loss_list, f) +ori_diff = original_diff(base_model, finetuned_model) + # # save trained delta save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff.pt"),layers=args.layers) @@ -102,6 +111,6 @@ if args.save_full_model: print("saving uncalibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, f"uncalibrated_model"), device="cpu",layers=args.layers) + save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, f"uncalibrated_model"), device="cpu",layers=args.layers,ori_diff=ori_diff) # print("saving calibrated model") # save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") diff --git a/bitdelta/train2.py b/bitdelta/train2.py index 37c9c70..eb9d66d 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -3,7 +3,7 @@ import torch import torch.nn.functional as F -from bitdelta.diff import compress_diff, 
save_diff, save_full_model +from bitdelta.diff2 import compress_diff, save_diff, save_full_model from bitdelta.misc import find_corr_stddev from bitdelta.utils import get_model, parse_args, get_tokenizer @@ -17,7 +17,7 @@ # create save_dir if it doesn't exist os.makedirs(args.save_dir, exist_ok=True) -tokenizer = get_tokenizer(args.base_model) +tokenizer = get_tokenizer(args.finetuned_model) with torch.no_grad(): base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) @@ -26,12 +26,6 @@ finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) print(f"compressing diff...") -compress_diff(base_model, finetuned_model, finetuned_compressed_model) +compress_diff(base_model, finetuned_model, finetuned_compressed_model,args.save_dir) -# save untrained delta -save_diff(finetuned_compressed_model, os.path.join(args.save_dir, "diff_untrained.pt")) - - -if args.save_full_model: - print("saving uncalibrated model") - save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, "uncalibrated_model"), device="cpu") +tokenizer.save_pretrained(args.save_dir) diff --git a/run.sh b/run.sh index ef3b529..8874caf 100644 --- a/run.sh +++ b/run.sh @@ -1,9 +1,9 @@ -MODEL_SAVE_DIR=save/ +MODEL_SAVE_DIR=save/uncalibrated_model_0 mkdir -p $MODEL_SAVE_DIR CUDA_VISIBLE_DEVICES=6,7 python \ - bitdelta/train.py \ + bitdelta/train2.py \ --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ diff --git a/tailor.py b/tailor.py index cf3143e..270bdaa 100755 --- a/tailor.py +++ b/tailor.py @@ -15,20 +15,22 @@ import re import random import numpy as np +import math -pretrained_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" +parser = argparse.ArgumentParser() +parser.add_argument('--finetuned_model_name', type=str, required=True, help='finetuned model name') +parser.add_argument('--save_dir', type=str, required=True, help='finetuned model name') +args = parser.parse_args() -finetuned_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf" # /data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf +pretrained_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" -pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, +finetuned_model_name = args.finetuned_model_name # /data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf +pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, device_map="cpu") pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name) -finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name, +finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name, device_map="cpu") finetuned_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=finetuned_model_name) - -save_dir = "/home/pingbowen/workspace/delta-compression/BitDelta/save/uncalibrated_model" - def set_random_seed(seed: int = 0): """ set random seed @@ -47,6 +49,7 @@ def set_random_seed(seed: int = 0): # scale_factor = finetuned_model.config.intermediate_size / 
finetuned_model.config.hidden_size +scale_factor = 1.45 def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list): """ get the names of parameters that need to be merged @@ -62,17 +65,6 @@ def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: return param_names_to_merge - -task_vector_param_dict = {} -pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()} -finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} -# param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) -# with torch.no_grad(): -# for param_name in finetuned_param_dict.keys(): -# task_vector_param_dict[param_name] = finetuned_param_dict[param_name] - pretrained_param_dict[param_name] -# print(f"name {param_name} data {task_vector_param_dict[param_name]} ") - - # import pdb # pdb.set_trace() @@ -81,12 +73,11 @@ def decomposition(masked_input_tensor,dim): U , S , V = torch.svd(masked_input_tensor) U , S , V = U[:, :dim],S[:dim],V[:, :dim] # return torch.mm(U, torch.diag(S)), V.t() - # return U, torch.mm(torch.diag(S), V.t()) #return lora_B, lora_A - return torch.mm(torch.mm(U, torch.diag(S)), V.t()) + return torch.mm(U, torch.mm(torch.diag(S), V.t())) #return lora_B, lora_A -# dim = 256 +# dim = 1024 dim = 128 -# dim = 16 +# dim = 64 print("----------------------dim: ",dim) print("----------------------dim: ",dim) print("----------------------dim: ",dim) @@ -98,35 +89,34 @@ def decomposition(masked_input_tensor,dim): malign_dict = {} other_dict = {} -# finetuned_param_dict -# for param_name, param_value in tqdm(pretrained_param_dict.items()): +task_vector_param_dict = {} +pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()} +finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} +param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) +with torch.no_grad(): + for param_name in param_names_to_merge: + if "self_attn" in param_name or "mlp" in param_name: + # import pdb ;pdb.set_trace() + if "mlp" in param_name: + dim = math.ceil(dim * scale_factor) + + delta = decomposition(finetuned_param_dict[param_name] - pretrained_param_dict[param_name],dim=dim) + finetuned_model.get_submodule(param_name.replace(".weight", "")).weight.copy_(pretrained_model.get_submodule(param_name.replace(".weight", "")).weight + delta) + # print(f"name {param_name} data {task_vector_param_dict[param_name]} ") + + +finetuned_model.save_pretrained(save_directory=args.save_dir) +finetuned_tokenizer.save_pretrained(save_directory=args.save_dir) + +# for param_name, param_value in tqdm(task_vector_param_dict.items()): # if "self_attn" in param_name or "mlp" in param_name: -# pass -# else: -# other_dict[param_name] = param_value.contiguous() - -diff = dict() - -for param_name, param_value in tqdm(finetuned_param_dict.items()): - if "self_attn" in param_name or "mlp" in param_name: - delta = param_value - pretrained_param_dict[param_name] - if "mlp" in param_name: - dim = int(dim * 1.45) - delta = decomposition(delta,dim=dim) - diff[param_name] = (pretrained_param_dict[param_name] + delta).contiguous() - else: - diff[param_name] = param_value.contiguous() - # lora_A = lora_A * (dim/16) ###补偿scaling, 以后的alpha可以统一为16 - 
# peft_key = "base_model.model." + param_name.split(".weight")[0] - # print(peft_key+".lora_A.weight") - # peft_dict[peft_key+".lora_A.weight"] = lora_A.contiguous() - # peft_dict[peft_key+".lora_B.weight"] = lora_B.contiguous() - -for n,p in pretrained_model.named_parameters(): - p.data.copy_(diff[n]) - -pretrained_model.save_pretrained(save_dir) -finetuned_tokenizer.save_pretrained(save_dir) +# lora_B, lora_A = decomposition(param_value,dim=dim) +# lora_A = lora_A * (dim/16) ###补偿scaling, 以后的alpha可以统一为16 +# peft_key = "base_model.model." + param_name.split(".weight")[0] +# print(peft_key+".lora_A.weight") +# peft_dict[peft_key+".lora_A.weight"] = lora_A.contiguous() +# peft_dict[peft_key+".lora_B.weight"] = lora_B.contiguous() + # other_dict = {k: v.to(torch.float16) for k, v in other_dict.items()} @@ -135,7 +125,7 @@ def decomposition(masked_input_tensor,dim): # torch.save(other_dict, os.path.join(other_para_path, "pretrain_other.pt")) -# peft_dict = {k: v.to(torch.float16) for k, v in peft_dict.items()} +peft_dict = {k: v.to(torch.float16) for k, v in peft_dict.items()} # layernum = 40 # for lnum in range(layernum): @@ -160,7 +150,7 @@ def decomposition(masked_input_tensor,dim): -# malign_dict = {k: v.to(torch.float16) for k, v in malign_dict.items()} +malign_dict = {k: v.to(torch.float16) for k, v in malign_dict.items()} # import pdb # pdb.set_trace() @@ -175,6 +165,20 @@ def decomposition(masked_input_tensor,dim): print("--end--") + + + +# num , masked_input_tensor = 0,input_tensor +# if "self_attn" in param_name or "mlp" in param_name: +# if "mlp" in param_name: +# dim = math.ceil(dim * scale_factor) +# thresh_hold = 0.06752 +# num, masked_input_tensor = decomposition(input_tensor,dim=dim) + + + + + # for param_name, param_value in finetuned_model.named_parameters(): # if param_name in masked_param_dict: # param_value.data.copy_(masked_param_dict[param_name]) diff --git a/test.py b/test.py index efdb2bf..1d8fe23 100644 --- a/test.py +++ b/test.py @@ -5,9 +5,9 @@ from accelerate import infer_auto_device_map, init_empty_weights import torch.nn as nn import os -from llava.model.language_model.llava_llama import LlavaConfig +# from llava.model.language_model.llava_llama import LlavaConfig from transformers import AutoTokenizer, AutoModelForCausalLM -from llava.model import * +# from llava.model import * def get_tokenizer(tokenizer_name): tokenizer = transformers.AutoTokenizer.from_pretrained( @@ -88,18 +88,21 @@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d del base_model -model_path = "/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/" -lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path) -tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False) -print('Loading LLaVA from base model...') -model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs) -token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features -if model.lm_head.weight.shape[0] != token_num: - model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) - model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype)) +A = torch.Tensor([[1, 2, 3],[6,5,4]]) +B = torch.Tensor([[9],[9]]) +A[:,-1:] = B +print(A) +# U,S,V = torch.svd(A) +# # print("-----------------") + +# print(A.shape) +# print("-----------------") +# print(S.shape) +# print("-----------------") +# print(V) # 
base_model = get_model("/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/", "cuda") # params = base_model.state_dict() From 6836149e19514a49132f904604fc8e84d11c6c35 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Thu, 14 Mar 2024 09:30:55 +0800 Subject: [PATCH 03/14] Attn,mlp fp16+1bit --- bitdelta/diff2.py | 30 ++--- cosine_sim_check.py | 281 ++++++++++++++++++++++++++++++++++++++++++++ run.sh | 2 +- run_tailor.sh | 11 ++ 4 files changed, 305 insertions(+), 19 deletions(-) create mode 100644 cosine_sim_check.py create mode 100644 run_tailor.sh diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 30c986c..56ba348 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -64,16 +64,22 @@ def compress_submodule(name, subname, module, submodule): torch.cuda.empty_cache() setattr(module, subname, compressed) - # TODO: this can be parallelized + # TODO: 根据thresh 选择压缩比例 for name, module in finetuned_compressed_model.named_modules(): - - if "self_attn" in name: + if "self_attn" in name or "mlp" in name: for subname, submodule in module.named_children(): if "proj" in subname: base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - # compress_submodule(name, subname, module, submodule) - U,S,V = decomposition(finetuned_weight - base_weight,dim=1024) + dim , thresh = 1024,0.7 + + if "mlp" in name: + dim , thresh = 2048 , 0.24 + + U,S,V = decomposition(finetuned_weight - base_weight,dim=dim) + energy_total = torch.sum(S**2) + energy_top_percent = torch.sum(S[:50]**2) + ratio = energy_top_percent / energy_total compressed_U, compressed_V = BinaryDiff(weight=U[:,64:]).to(finetuned_weight.device), BinaryDiff(weight=V[:,64:]).to(finetuned_weight.device) U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff @@ -82,19 +88,7 @@ def compress_submodule(name, subname, module, submodule): U[:,64:] , V[:,64:] = weight_U.T, weight_V.T # 不确定是否有bug delta = U @ torch.diag(S) @ V.t() with torch.no_grad(): - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) - - - elif "mlp" in name: - with torch.no_grad(): - for subname, submodule in module.named_children(): - if "proj" in subname: - base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - U,S,V = decomposition(finetuned_weight - base_weight,dim=int(128 * 1.45)) - delta = torch.mm(torch.mm(U, torch.diag(S)), V.t()) - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) - + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(base_weight.dtype)) finetuned_model.save_pretrained(save_dir) diff --git a/cosine_sim_check.py b/cosine_sim_check.py new file mode 100644 index 0000000..788937a --- /dev/null +++ b/cosine_sim_check.py @@ -0,0 +1,281 @@ +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1" +import torch +from torch import nn +import gc +import torch.nn.functional as F +from bitdelta.diff import save_diff, save_full_model +from bitdelta.misc import find_corr_stddev +from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm +from bitdelta.utils import get_model, parse_args, get_tokenizer +from tqdm import tqdm +from bitdelta.data import 
get_dataset, get_dataloader + +import json +import transformers + +import re +import random +import numpy as np + +def set_random_seed(seed: int = 0): + """ + set random seed + :param seed: int, random seed + :return: + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + +set_random_seed(seed=0) + +def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list): + """ + get the names of parameters that need to be merged + :param input_param_names: list, names of input parameters + :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded + :return: + """ + param_names_to_merge = [] + for param_name in input_param_names: + exclude = any([re.match(exclude_pattern, param_name) for exclude_pattern in exclude_param_names_regex]) + if not exclude: + param_names_to_merge.append(param_name) + return param_names_to_merge + + +def get_model(model_path): + if "mistral" in model_path or "mixtral" in model_path: + data_type = torch.bfloat16 + else: + data_type = torch.float16 + with torch.no_grad(): + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=data_type, + low_cpu_mem_usage=True, + # device_map="auto" + ).to("cuda") + return model + + + + +def singular_values_for_variance(tensor, variances=[0.9, 0.95]): + """ + Calculate the minimum number of singular values needed to reach specified variance ratios. + + Parameters: + - tensor: A 2D tensor for which to calculate the SVD. + - variances: A list of variance ratios to calculate the minimum number of singular values for. + + Returns: + A dictionary with the variance ratios as keys and the minimum number of singular values needed as values. 
+ """ + # Compute SVD + U, S, V = torch.svd(tensor) + # Calculate the squared singular values (proportional to variance explained) + squared_singular_values = torch.pow(S, 2) + total_variance = torch.sum(squared_singular_values) + cumulative_variance_ratios = torch.cumsum(squared_singular_values, dim=0) / total_variance + + # Find the minimum number of singular values for each specified variance + results = {} + for variance in variances: + num_singular_values = torch.searchsorted(cumulative_variance_ratios, variance) + 1 # +1 because indices start at 0 + results[variance] = num_singular_values.item() + + return results + + +def cosine_similarity_matrix(finetuned_param, pretrained_param): + finetuned_flat = finetuned_param.view(-1) + pretrained_flat = pretrained_param.view(-1) + cosine_similarity = F.cosine_similarity(finetuned_flat.unsqueeze(0), pretrained_flat.unsqueeze(0), dim=1) + return cosine_similarity.item() + + +def check_delta_properties(delta_weight): + # analysis properties for each linear weight in deltas + + # 计算矩阵的Frobenius范数(二范数) + matrix_norm = torch.norm(delta_weight, p='fro') + + # 计算矩阵的条件数 + # 矩阵的条件数(Condition Number)衡量的是矩阵求逆的数值稳定性。具体来说,它描述了原始数据的微小变化如何影响矩阵运算的结果。条件数越高,计算结果对数据的微小变化越敏感,即数值解可能不稳定;条件数越低,矩阵和其运算则越稳定。 + + # 定义 + # 对于非奇异矩阵 A,其条件数定义为矩阵 A 的范数与 A 的逆的范数的乘积: + # 其中,范数可以是任意矩阵范数,但是最常用的是2-范数(即谱范数),此时条件数可以解释为矩阵最大奇异值与最小奇异值的比值。 + cond_number = torch.linalg.cond(delta_weight) + + # 计算矩阵的秩 + rank = torch.linalg.matrix_rank(delta_weight) + + # 计算矩阵的有效秩 + rank_eff = singular_values_for_variance(delta_weight, variances=[0.9, 0.95]) + rank_90, rank_95 = rank_eff[0.9], rank_eff[0.95] + + + return matrix_norm, cond_number, rank, rank_90, rank_95 + + + + + ## First part: checkout cosine similarity in first layer FFN w1 + + # if "llama" in base_model_path: + # #weight_key = "model.layers.0.mlp.gate_proj.weight" + # tensor_base = base_model.model.layers[0].mlp.gate_proj.weight + # tensor_ft = finetuned_model.model.layers[0].mlp.gate_proj.weight + # cosine_sim = F.cosine_similarity(tensor_base, tensor_ft, dim=1) + # overall_similarity = cosine_sim.mean() + # base_model_name = base_model_path.split("/")[-1] + # finetuned_model_name = finetuned_model_path.split("/")[-1] + # overall_similarity_result = overall_similarity.item() + # print(f"Overall Cosine Similarity between {base_model_name} and {finetuned_model_name}: {overall_similarity_result}") + # ## 说明是llama模型 + # elif "Mixtral" in base_model_path: + # tensor_base = base_model.model.layers[0].block_sparse_moe.experts[0].w1.weight + # tensor_ft = base_model.model.layers[0].block_sparse_moe.experts[1].w1.weight + # cosine_sim = F.cosine_similarity(tensor_base, tensor_ft, dim=1) + # overall_similarity = cosine_sim.mean() + + + + + ## Second part: checkout delta square decline potential using scaled weight + + ## third part: checkout rank of original delta and + ## scaled calculation delta(relation between variance ratio and #singular values) + +def analysis_delta(base_model_path, finetuned_model_path): + pretrained_model = get_model(base_model_path) + finetuned_model = get_model(finetuned_model_path) + print(f"We are analysising the delta between the Pretrained model: {base_model_path} and the Finetuned model: {finetuned_model_path}") + task_vector_param_dict = {} + pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()} + finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} + param_names_to_merge = 
get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) + + cos_sim_list = [] + norm_list = [] + cond_number_list = [] + rank_list = [] + rank_90_list = [] + rank_95_list = [] + + with torch.no_grad(): + for param_name in param_names_to_merge: + param_list = ['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'] + if all(char not in param_name for char in param_list): + continue + # import pdb + # pdb.set_trace() + #研究finetuned_param_dict[param_name]和pretrained_param_dict[param_name]的cosine similarity + task_vector_param_dict[param_name] = finetuned_param_dict[param_name] - pretrained_param_dict[param_name] + #check similarity + print(f"Investigating param_name: {param_name}") + cos_sim = cosine_similarity_matrix(finetuned_param_dict[param_name].float(), pretrained_param_dict[param_name].float()) + cos_sim_list.append(cos_sim) + print(f"cosine similarity between the finetuned model and pretrained model: ",cos_sim) + #研究他们差值的统计性质 + matrix_norm, cond_number, rank, rank_90, rank_95 = check_delta_properties(task_vector_param_dict[param_name].float()) + norm_list.append(matrix_norm) + cond_number_list.append(cond_number) + rank_list.append(rank) + rank_90_list.append(rank_90) + rank_95_list.append(rank_95) + print(f"Properties of Delta Weight---matrix_norm: {matrix_norm}, cond_number: {cond_number}, rank: {rank}, rank_90: {rank_90}, rank_95: {rank_95}") + + print(f"avg_cos_sim: {sum(cos_sim_list)/len(cos_sim_list)}") + print(f"avg_norm: {sum(norm_list)/len(norm_list)}") + print(f"avg_cond_number: {sum(cond_number_list)/len(cond_number_list)}") + print(f"avg_rank: {sum(rank_list)/len(rank_list)}") + print(f"avg_rank_90: {sum(rank_90_list)/len(rank_90_list)}") + print(f"avg_rank_95: {sum(rank_95_list)/len(rank_95_list)}") + + print(f"Analysis end for the pretrained model: {base_model_path} and finetuned_model: {finetuned_model_path}") + del pretrained_model + del finetuned_model + return + +moe_base = "/home/wanghanqing/projects/models/model_ver2/Mixtral-8x7B-v0.1" +instruct_base = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.2" +base_model = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-v0.1" + +code_llama13 = "/data/public/opensource_models/codellama/codellama-13b-python-hf" +wizard_coder = "/data/public/opensource_models/WizardLM/WizardCoder-Python-13B-V1.0" +llama2_7b = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" +llama2_7b_chat = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf" +llama2_13b = "/data/public/opensource_models/meta-llama/Llama-2-13b-hf" +llama2_13b_chat = "/data/public/opensource_models/meta-llama/Llama-2-13b-chat-hf" +wizard_math_7b = "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0" +wizard_math_13b = "/data/public/opensource_models/WizardLM/WizardMath-13B-V1.0" +meta_math_7b = "/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf" +magicoder_7b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf" +magicoder_13b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-13b/ckpts/checkpoints/epoch_2_hf" + + +# Mistral-7B +## base +mistral_7b = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-v0.1" +## finetuned +mistral_7b_instruct_v1 = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.1" +mistral_7b_instruct_v2 = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.2" + +# llama2-7b +## base +llama2_7b = 
"/data/public/opensource_models/meta-llama/Llama-2-7b-hf" +## finetuned +llama2_7b_chat = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf" +wizard_math_7b = "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0" +meta_math_7b = "/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf" +magicoder_7b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf" + +# llama2-13b +## base +llama2_13b = "/data/public/opensource_models/meta-llama/Llama-2-13b-hf" +## finetuned +llama2_13b_chat = "/data/public/opensource_models/meta-llama/Llama-2-13b-chat-hf" +wizard_math_13b = "/data/public/opensource_models/WizardLM/WizardMath-13B-V1.0" +magicoder_13b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-13b/ckpts/checkpoints/epoch_2_hf" +code_llama13 = "/data/public/opensource_models/codellama/codellama-13b-python-hf" +wizard_coder = "/data/public/opensource_models/WizardLM/WizardCoder-Python-13B-V1.0" + + + + +import sys + +# 打开一个日志文件 +log_file = open("analysis_log.txt", "w") + +# 保存原始的标准输出 +original_stdout = sys.stdout + +# 重定向标准输出到文件 +sys.stdout = log_file + +# 你的代码,所有print函数的输出都会写入log.txt +print("This will be written to analysis_log.txt") + + + + + +analysis_delta(base_model_path = llama2_7b, finetuned_model_path = llama2_7b_chat) +analysis_delta(base_model_path = llama2_7b, finetuned_model_path = wizard_math_7b) +analysis_delta(base_model_path = llama2_7b, finetuned_model_path = meta_math_7b) +analysis_delta(base_model_path = llama2_7b, finetuned_model_path = magicoder_7b) + +# 恢复原始的标准输出 +sys.stdout = original_stdout + +# 关闭日志文件 +log_file.close() \ No newline at end of file diff --git a/run.sh b/run.sh index 8874caf..eb40a99 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=save/uncalibrated_model_0 +MODEL_SAVE_DIR=save/uncalibrated_model_attn_1024_mlp_2048 mkdir -p $MODEL_SAVE_DIR diff --git a/run_tailor.sh b/run_tailor.sh new file mode 100644 index 0000000..71b7393 --- /dev/null +++ b/run_tailor.sh @@ -0,0 +1,11 @@ +python \ + tailor.py \ + --finetuned_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf \ + --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/7b_chat \ + + +# & + +# python3 tailor.py \ +# --finetuned_model_name /data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf \ +# --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/math_lora_7b \ \ No newline at end of file From 5ea7c52fb6727d31fddb01a762e4639c60270d1c Mon Sep 17 00:00:00 2001 From: pingbowen Date: Sat, 16 Mar 2024 11:14:55 +0800 Subject: [PATCH 04/14] delta orthogonal --- bitdelta/diff2.py | 60 ++++++++++++++++++++++++++++++++++++++++++++-- bitdelta/train2.py | 4 ++-- run.sh | 10 ++++---- test.py | 31 ++++++++++++++---------- 4 files changed, 83 insertions(+), 22 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 56ba348..1303f27 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import gc - +import torch.nn.functional as F from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm from bitdelta.utils import get_model, get_tokenizer @@ -46,6 +46,32 @@ def Pass(layers=None,name=None): return False +def solve_orthogonal(p, f): + # 计算x + delta ,n , sacled_p = f - p, p.shape[-1],p + + # import pdb; pdb.set_trace() + + for i in range(n): + p_i,f_i = p[:,i],f[:,i] + dot_fp , dot_pd = torch.dot(f_i, p_i) , torch.dot(p_i, delta[:,i]) + + if dot_fp == 0 or dot_pd == 0: # 
p_i或f_i是零向量,因为低秩, 边界p_i与delta_i直接正交 + continue + + dot_pp = torch.dot(p_i, p_i) + x = dot_fp / dot_pp if dot_pp != 0 else None + + + # 计算(f - xp) + with torch.no_grad(): + delta[:,i].data.copy_(f_i - x * p_i) if x is not None else None + sacled_p[:,i].data.copy_(sacled_p[:,i].data * x) if x is not None else None + + # import pdb; pdb.set_trace() + + return delta , sacled_p + def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -68,6 +94,35 @@ def compress_submodule(name, subname, module, submodule): for name, module in finetuned_compressed_model.named_modules(): if "self_attn" in name or "mlp" in name: for subname, submodule in module.named_children(): + + with torch.no_grad(): + if "proj" in subname: + p = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + f = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) + dim = 128 + + if "mlp" in name: + dim = int(128 * 1.45) + + delta , scaled_p = solve_orthogonal(p, f) + U,S,V = decomposition(delta,dim=dim) + delta = U @ torch.diag(S) @ V.t() + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) + + ''' + if torch.sum(torch.abs(delta_pre)) > torch.sum(torch.abs(delta)): + U,S,V = decomposition(delta,dim=128) + delta = U @ torch.diag(S) @ V.t() + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) + else: + U,S,V = decomposition(delta_pre,dim=128) + delta_pre = U @ torch.diag(S) @ V.t() + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta_pre.to(p.dtype)) + ''' + + ''' + fp 16 + 1bit + if "proj" in subname: base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) @@ -89,7 +144,8 @@ def compress_submodule(name, subname, module, submodule): delta = U @ torch.diag(S) @ V.t() with torch.no_grad(): finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(base_weight.dtype)) - + ''' + finetuned_model.to(torch.bfloat16) finetuned_model.save_pretrained(save_dir) def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): diff --git a/bitdelta/train2.py b/bitdelta/train2.py index eb9d66d..e87ff5e 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -20,8 +20,8 @@ tokenizer = get_tokenizer(args.finetuned_model) with torch.no_grad(): - base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) - finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map).to(torch.float32) + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map).to(torch.float32) finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) diff --git a/run.sh b/run.sh index eb40a99..5a7bcc6 100644 --- a/run.sh +++ b/run.sh @@ -1,15 +1,15 @@ -MODEL_SAVE_DIR=save/uncalibrated_model_attn_1024_mlp_2048 +MODEL_SAVE_DIR=save/uncalibrated_model_orthogonal_math mkdir -p $MODEL_SAVE_DIR 
-CUDA_VISIBLE_DEVICES=6,7 python \ +CUDA_VISIBLE_DEVICES=5,6 python \ bitdelta/train2.py \ --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ --batch_size 4 \ --num_steps 200 \ - --save_full_model True + --save_full_model True \ + # &> test.log - # --layers "layers.5."\ - # /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ + # /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ diff --git a/test.py b/test.py index 1d8fe23..7e0136f 100644 --- a/test.py +++ b/test.py @@ -1,10 +1,12 @@ import argparse import transformers +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "7" import torch from transformers import AutoConfig, AutoModelForCausalLM from accelerate import infer_auto_device_map, init_empty_weights import torch.nn as nn -import os +import torch.nn.functional as F # from llava.model.language_model.llava_llama import LlavaConfig from transformers import AutoTokenizer, AutoModelForCausalLM # from llava.model import * @@ -89,22 +91,25 @@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d del base_model -A = torch.Tensor([[1, 2, 3],[6,5,4]]) -B = torch.Tensor([[9],[9]]) +device = "cuda" if torch.cuda.is_available() else "cpu" + +# model = AutoModelForCausalLM.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/").to(device).to(torch.bfloat16) +# k = model.get_submodule("model.layers.0.self_attn.k_proj").weight + +a = torch.rand(4096) / 1000 +b = torch.rand(4096) / 1000 + +# a , b = a.to(torch.bfloat16) , b.to(torch.bfloat16) + +dot_fp , dot_pp = torch.dot(a, b) , torch.dot(b, b) -A[:,-1:] = B +x = dot_fp / dot_pp -print(A) -# U,S,V = torch.svd(A) -# # print("-----------------") +cosine_sim = F.cosine_similarity(a,b,dim=0) -# print(A.shape) -# print("-----------------") -# print(S.shape) -# print("-----------------") -# print(V) +cosine_sim2 = F.cosine_similarity(b,a - x * b,dim=0) -# base_model = get_model("/home/pingbowen/models/Llava-v1-vicuna/Llava-v1/", "cuda") +import pdb; pdb.set_trace() # params = base_model.state_dict() # print(params.keys()) From 7f2339df6e27e1a03d44a9ce08baf5231dd71ad3 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Mon, 18 Mar 2024 19:22:42 +0800 Subject: [PATCH 05/14] orthogonal --- bitdelta/diff2.py | 30 ++++++++++++++++++++++++++++-- run.sh | 2 +- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 1303f27..f303ca7 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -72,6 +72,25 @@ def solve_orthogonal(p, f): return delta , sacled_p +def get_outlier(tensor, percent=0.5): + # 计算保留的元素数量 + num_elements = tensor.numel() + num_to_keep = int(num_elements * percent / 100) + + # 展平张量并获取最大和最小的元素的索引 + flat_tensor = tensor.flatten() + _, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True) + _, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False) + + # 创建一个全零张量 + result = torch.zeros_like(tensor) + + # 仅在指定位置放置最大和最小的元素 + result.view(-1)[top_indices] = tensor.view(-1)[top_indices] + result.view(-1)[bottom_indices] = tensor.view(-1)[bottom_indices] + + return result + def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -99,13 +118,20 @@ def compress_submodule(name, subname, module, submodule): if "proj" in subname: p = 
base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) f = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - dim = 128 + dim , fp16_col = 1024 , 64 if "mlp" in name: - dim = int(128 * 1.45) + fp16_col = 128 delta , scaled_p = solve_orthogonal(p, f) U,S,V = decomposition(delta,dim=dim) + + compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) + U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff + weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff + U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T + + delta = U @ torch.diag(S) @ V.t() finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) diff --git a/run.sh b/run.sh index 5a7bcc6..266e5a1 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=save/uncalibrated_model_orthogonal_math +MODEL_SAVE_DIR=save/uncalibrated_model_orthogonal_mix_math mkdir -p $MODEL_SAVE_DIR From 19c1d3eb225a9083f71b7c7bbc707d953ac57cd3 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Tue, 19 Mar 2024 19:16:43 +0800 Subject: [PATCH 06/14] add outlier --- bitdelta/diff2.py | 84 +++++++++++++++++++++++++++++++---------------- run.sh | 2 +- test.py | 45 +++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 32 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index f303ca7..c0b6887 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -86,11 +86,19 @@ def get_outlier(tensor, percent=0.5): result = torch.zeros_like(tensor) # 仅在指定位置放置最大和最小的元素 - result.view(-1)[top_indices] = tensor.view(-1)[top_indices] - result.view(-1)[bottom_indices] = tensor.view(-1)[bottom_indices] + result = result.flatten() + result[top_indices] = flat_tensor[top_indices] + result[bottom_indices] = flat_tensor[bottom_indices] + result = result.reshape(tensor.shape) return result - + +def copy_nonzero_values(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = B[mask] + return A + def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -118,33 +126,28 @@ def compress_submodule(name, subname, module, submodule): if "proj" in subname: p = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) f = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - dim , fp16_col = 1024 , 64 - if "mlp" in name: - fp16_col = 128 - - delta , scaled_p = solve_orthogonal(p, f) - U,S,V = decomposition(delta,dim=dim) + delta , outlier_U, outlier_V = f - p , None, None + dim , fp16_col = 1024, 64 + if "self_attn" in name: + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name) + else: + dim , fp16_col = 1024 , 128 + # delta , scaled_p = solve_orthogonal(p, f) + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name) + compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff - U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T - + U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T + 
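# A minimal sketch of the mixed scheme applied above, assuming the BinaryDiff
# semantics defined earlier in this file (a sign mask scaled by the mean absolute
# value): the first `fp16_col` singular-vector columns stay in full precision, the
# remaining columns are binarized, and the largest-magnitude "outlier" entries are
# copied back in full precision just below.
import torch

def mixed_lowrank_approx(delta: torch.Tensor, dim: int = 1024, fp16_col: int = 64) -> torch.Tensor:
    # Rank-`dim` SVD of the weight delta.
    U, S, V = torch.svd(delta.float())
    U, S, V = U[:, :dim], S[:dim], V[:, :dim]
    for M in (U, V):
        tail = M[:, fp16_col:]
        sign = (tail >= 0).to(tail.dtype) * 2 - 1   # 1-bit sign mask, as in BinaryDiff
        M[:, fp16_col:] = sign * tail.abs().mean()  # one full-precision scale per matrix (coeff)
    return U @ torch.diag(S) @ V.t()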
+ # import pdb; pdb.set_trace() + if outlier_U is not None and outlier_V is not None: + copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) - delta = U @ torch.diag(S) @ V.t() - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) - - ''' - if torch.sum(torch.abs(delta_pre)) > torch.sum(torch.abs(delta)): - U,S,V = decomposition(delta,dim=128) - delta = U @ torch.diag(S) @ V.t() - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(scaled_p.to(p.dtype) + delta.to(p.dtype)) - else: - U,S,V = decomposition(delta_pre,dim=128) - delta_pre = U @ torch.diag(S) @ V.t() - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta_pre.to(p.dtype)) - ''' + delta = U @ torch.diag(S) @ V.t() + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) ''' fp 16 + 1bit @@ -171,6 +174,7 @@ def compress_submodule(name, subname, module, submodule): with torch.no_grad(): finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(base_weight.dtype)) ''' + # import pdb ; pdb.set_trace() finetuned_model.to(torch.bfloat16) finetuned_model.save_pretrained(save_dir) @@ -222,16 +226,40 @@ def load_diff(model, diff_dir,ori_diff): model.config.vocab_size = model.lm_head.weight.size(0) -def decomposition(masked_input_tensor,dim=None,st=None,ed=None,name=None): +def set_zero(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = 0 + return A + + +def decomposition(masked_input_tensor,dim=None,name=None): U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) + outlier_U , outlier_V = None, None + if dim is not None: U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] - if st is not None and ed is not None: - U , S , V = U[:, st:ed],S[st:ed] ,V[:, st:ed] + if "self_attn" in name: + outlier_U = get_outlier(U[:,64:], percent=0.2) + outlier_V = get_outlier(V[:,64:], percent=0.2) + + set_zero(U[:,64:], outlier_U) + set_zero(V[:,64:], outlier_V) + + else: + outlier_U = get_outlier(U[:,128:], percent=0.1) + outlier_V = get_outlier(V[:,128:], percent=0.1) + + set_zero(U[:,128:], outlier_U) + set_zero(V[:,128:], outlier_V) - return U, S, V + # max_val, min_val, mean_abs_val = round(torch.max(U).item(),4), round(torch.min(U).item(),4), round(torch.mean(torch.abs(U)).item(),4) + + # print(f"max_val {max_val} pos_min {round(torch.min(outlier[outlier > 0]).item(),4)} mean_abs_val {mean_abs_val} ratio {round(torch.min(outlier[outlier > 0]).item() / mean_abs_val,4)}") + # import pdb; pdb.set_trace() + return U, S, V , outlier_U, outlier_V def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): base_model = get_model(base_model_name, device) diff --git a/run.sh b/run.sh index 266e5a1..6eceba8 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=save/uncalibrated_model_orthogonal_mix_math +MODEL_SAVE_DIR=save/uncalibrated_model mkdir -p $MODEL_SAVE_DIR diff --git a/test.py b/test.py index 7e0136f..9709030 100644 --- a/test.py +++ b/test.py @@ -93,9 +93,6 @@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d device = "cuda" if torch.cuda.is_available() else "cpu" -# model = AutoModelForCausalLM.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/").to(device).to(torch.bfloat16) -# k = model.get_submodule("model.layers.0.self_attn.k_proj").weight - a = torch.rand(4096) / 1000 b = torch.rand(4096) / 1000 @@ -109,6 +106,48 
@@ def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, d cosine_sim2 = F.cosine_similarity(b,a - x * b,dim=0) +def filter_top_and_bottom_percent(tensor, percent=0.5): + # 计算保留的元素数量 + num_elements = tensor.numel() + num_to_keep = int(num_elements * percent / 100) + + # 展平张量并获取最大和最小的元素的索引 + flat_tensor = tensor.flatten() + _, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True) + _, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False) + + # 创建一个全零张量 + result = torch.zeros_like(tensor) + + # 仅在指定位置放置最大和最小的元素 + result = result.flatten() + result[top_indices] = flat_tensor[top_indices] + result[bottom_indices] = flat_tensor[bottom_indices] + result = result.reshape(tensor.shape) + + return result + +def copy_nonzero_values(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = B[mask] + return A + + +# 示例 +n = 4 +A = torch.randn(n, n) # 随机生成一个n × n的张量A +B = torch.zeros(n, n) # 创建一个n × n的全零张量B + +# 在B中随机设置一些非零值 +indices = torch.randint(0, n, (3, 2)) # 随机选择一些位置 +for i, j in indices: + B[i, j] = torch.randn(1).item() # 随机非零值 + +# 复制B中的非零值到A +updated_A = copy_nonzero_values(A, B) + + import pdb; pdb.set_trace() # params = base_model.state_dict() From 46d46ca852d757ab0838ae071686781ad54bca09 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Wed, 20 Mar 2024 18:31:42 +0800 Subject: [PATCH 07/14] for test --- bitdelta/diff2.py | 9 +++++++-- test.py | 10 ++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index c0b6887..e6166a4 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -5,6 +5,9 @@ from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm from bitdelta.utils import get_model, get_tokenizer +# 离群值抽出之后 原来位置设定成多少,如果设置成0会让分母增大 +# U, V + class BinaryDiff(nn.Module): def __init__(self, weight): super().__init__() @@ -142,9 +145,10 @@ def compress_submodule(name, subname, module, submodule): weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T - # import pdb; pdb.set_trace() + if outlier_U is not None and outlier_V is not None: - copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) + tmp = copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) + # import pdb; pdb.set_trace() delta = U @ torch.diag(S) @ V.t() finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) @@ -246,6 +250,7 @@ def decomposition(masked_input_tensor,dim=None,name=None): outlier_V = get_outlier(V[:,64:], percent=0.2) set_zero(U[:,64:], outlier_U) + # import pdb; pdb.set_trace() set_zero(V[:,64:], outlier_V) else: diff --git a/test.py b/test.py index 9709030..ab02ab3 100644 --- a/test.py +++ b/test.py @@ -138,14 +138,8 @@ def copy_nonzero_values(A, B): n = 4 A = torch.randn(n, n) # 随机生成一个n × n的张量A B = torch.zeros(n, n) # 创建一个n × n的全零张量B - -# 在B中随机设置一些非零值 -indices = torch.randint(0, n, (3, 2)) # 随机选择一些位置 -for i, j in indices: - B[i, j] = torch.randn(1).item() # 随机非零值 - -# 复制B中的非零值到A -updated_A = copy_nonzero_values(A, B) +A = A.flatten() +values , top_indices = torch.topk(A, 1, largest=True) import pdb; pdb.set_trace() From 840417cea482888fdac0863447e0269801aaec95 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Thu, 21 Mar 2024 08:20:42 +0800 Subject: [PATCH 08/14] bitdelta outlier --- bitdelta/diff.py | 75 ++++++++++++++++++++++++++++++++--------------- bitdelta/diff2.py | 2 +- bitdelta/train.py | 
5 ++++ run.sh | 2 +- 4 files changed, 59 insertions(+), 25 deletions(-) diff --git a/bitdelta/diff.py b/bitdelta/diff.py index 2f97ab7..594b936 100644 --- a/bitdelta/diff.py +++ b/bitdelta/diff.py @@ -9,7 +9,9 @@ class BinaryDiff(nn.Module): def __init__(self, base, finetune): super().__init__() diff = finetune - base - diff = decomposition(diff, st=64, ed=1024) + outlier = get_outlier(diff, percent=0.02) + set_zero(diff, outlier) + # import pdb; pdb.set_trace() quantile = diff.float().abs().mean() mask = torch.ones_like(diff) @@ -18,6 +20,7 @@ def __init__(self, base, finetune): self.register_buffer("mask", mask) self.register_buffer("base", base.T) + self.register_buffer("outlier", outlier) self.register_parameter( "coeff", nn.Parameter( @@ -39,13 +42,38 @@ def forward(self, x): repeated_mask = self.mask.unsqueeze(0).repeat(x.size(0), 1, 1) return x @ self.base + self.coeff * binary_bmm(x, repeated_mask) -def Pass(layers=None,name=None): - if layers is not None: - for layer in layers: - if layer in name: - return True - return False +def set_zero(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = 0 + return A +def get_outlier(tensor, percent=0.5): + # 计算保留的元素数量 + num_elements = tensor.numel() + num_to_keep = int(num_elements * percent / 100) + + # 展平张量并获取最大和最小的元素的索引 + flat_tensor = tensor.flatten() + _, top_indices = torch.topk(flat_tensor, num_to_keep, largest=True) + _, bottom_indices = torch.topk(flat_tensor, num_to_keep, largest=False) + + # 创建一个全零张量 + result = torch.zeros_like(tensor) + + # 仅在指定位置放置最大和最小的元素 + result = result.flatten() + result[top_indices] = flat_tensor[top_indices] + result[bottom_indices] = flat_tensor[bottom_indices] + result = result.reshape(tensor.shape) + + return result + +def copy_nonzero_values(A, B): + # 复制B中非零值到A的对应位置 + mask = B != 0 + A[mask] = B[mask] + return A def compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=None): def compress_submodule(name, subname, module, submodule): @@ -67,25 +95,26 @@ def compress_submodule(name, subname, module, submodule): # TODO: this can be parallelized # flag = False - for name, module in finetuned_compressed_model.named_modules(): - # if flag == True: - # break - - if "self_attn" in name: - for subname, submodule in module.named_children(): - if "proj" in subname: - compress_submodule(name, subname, module, submodule) - elif "mlp" in name: - with torch.no_grad(): + with torch.no_grad(): + for name, module in finetuned_model.named_modules(): + if "self_attn" in name or "mlp" in name: for subname, submodule in module.named_children(): if "proj" in subname: - base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - delta = decomposition(finetuned_weight - base_weight,dim=int(128 * 1.45)) - finetuned_compressed_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16)) - # flag = True + p , f = base_model.get_submodule(f"{name}.{subname}").weight.detach() , finetuned_model.get_submodule(f"{name}.{subname}").weight.detach() + + compressed = BinaryDiff(base=p, finetune=f) + mask, coeff , outlier = compressed.mask, compressed.coeff, compressed.outlier + weight = (unpack(mask)*2-1) * coeff + weight = weight.T.to(outlier.dtype) + + copy_nonzero_values(weight, outlier) # import pdb; pdb.set_trace() - # break + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + 
weight.to(p.dtype)) + + finetuned_model.save_pretrained("/home/pingbowen/workspace/delta-compression/BitDelta/save/test") + + + def save_diff(finetuned_compressed_model, save_dir,layers=None,ori_diff=None): diff_dict = {} diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index e6166a4..f60b6dd 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -147,7 +147,7 @@ def compress_submodule(name, subname, module, submodule): if outlier_U is not None and outlier_V is not None: - tmp = copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) + copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) # import pdb; pdb.set_trace() delta = U @ torch.diag(S) @ V.t() diff --git a/bitdelta/train.py b/bitdelta/train.py index 9e4bf97..a4a44e5 100644 --- a/bitdelta/train.py +++ b/bitdelta/train.py @@ -46,6 +46,10 @@ def original_diff(base_model, finetuned_model): print(f"compressing diff...") compress_diff(base_model, finetuned_model, finetuned_compressed_model,layers=args.layers) +tokenizer.save_pretrained("/home/pingbowen/workspace/delta-compression/BitDelta/save/test") + + +''' train_num_samples = args.batch_size * args.num_steps train_dataset = get_dataset( args.dataset_name, @@ -114,3 +118,4 @@ def original_diff(base_model, finetuned_model): save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff_untrained.pt"), os.path.join(args.save_dir, f"uncalibrated_model"), device="cpu",layers=args.layers,ori_diff=ori_diff) # print("saving calibrated model") # save_full_model(args.base_model, args.finetuned_model, os.path.join(args.save_dir, "diff.pt"), os.path.join(args.save_dir, "calibrated_model"), device="cpu") +''' \ No newline at end of file diff --git a/run.sh b/run.sh index 6eceba8..cc5a5c4 100644 --- a/run.sh +++ b/run.sh @@ -3,7 +3,7 @@ MODEL_SAVE_DIR=save/uncalibrated_model mkdir -p $MODEL_SAVE_DIR CUDA_VISIBLE_DEVICES=5,6 python \ - bitdelta/train2.py \ + bitdelta/train.py \ --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ From 7c186f444c7f91cefd2c365956f47ec76d61aefc Mon Sep 17 00:00:00 2001 From: pingbowen Date: Thu, 21 Mar 2024 18:33:23 +0800 Subject: [PATCH 09/14] modify --- run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.sh b/run.sh index cc5a5c4..6eceba8 100644 --- a/run.sh +++ b/run.sh @@ -3,7 +3,7 @@ MODEL_SAVE_DIR=save/uncalibrated_model mkdir -p $MODEL_SAVE_DIR CUDA_VISIBLE_DEVICES=5,6 python \ - bitdelta/train.py \ + bitdelta/train2.py \ --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ From 8723564be15671746bcba4b4dd42989d751fd4e1 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Mon, 25 Mar 2024 15:27:16 +0800 Subject: [PATCH 10/14] outlier --- bitdelta/diff2.py | 16 ++++++++-------- bitdelta/train2.py | 6 +++--- bitdelta/utils.py | 2 ++ run.sh | 13 ++++++++++--- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index f60b6dd..8489ccc 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -102,7 +102,7 @@ def copy_nonzero_values(A, B): A[mask] = B[mask] return A -def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,layers=None): +def compress_diff(base_model, finetuned_model, 
finetuned_compressed_model,save_dir,args): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -134,11 +134,11 @@ def compress_submodule(name, subname, module, submodule): dim , fp16_col = 1024, 64 if "self_attn" in name: - U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name) + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,attn_outlier=args.attn_outlier) else: dim , fp16_col = 1024 , 128 # delta , scaled_p = solve_orthogonal(p, f) - U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name) + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,mlp_outlier=args.mlp_outlier) compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff @@ -237,7 +237,7 @@ def set_zero(A, B): return A -def decomposition(masked_input_tensor,dim=None,name=None): +def decomposition(masked_input_tensor,dim=None,name=None,attn_outlier=0.1,mlp_outlier=0.1): U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) outlier_U , outlier_V = None, None @@ -246,16 +246,16 @@ def decomposition(masked_input_tensor,dim=None,name=None): U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] if "self_attn" in name: - outlier_U = get_outlier(U[:,64:], percent=0.2) - outlier_V = get_outlier(V[:,64:], percent=0.2) + outlier_U = get_outlier(U[:,64:], percent=attn_outlier) + outlier_V = get_outlier(V[:,64:], percent=attn_outlier) set_zero(U[:,64:], outlier_U) # import pdb; pdb.set_trace() set_zero(V[:,64:], outlier_V) else: - outlier_U = get_outlier(U[:,128:], percent=0.1) - outlier_V = get_outlier(V[:,128:], percent=0.1) + outlier_U = get_outlier(U[:,128:], percent=mlp_outlier) + outlier_V = get_outlier(V[:,128:], percent=mlp_outlier) set_zero(U[:,128:], outlier_U) set_zero(V[:,128:], outlier_V) diff --git a/bitdelta/train2.py b/bitdelta/train2.py index e87ff5e..e2fed61 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -20,12 +20,12 @@ tokenizer = get_tokenizer(args.finetuned_model) with torch.no_grad(): - base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map).to(torch.float32) - finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map).to(torch.float32) + base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) print(f"compressing diff...") -compress_diff(base_model, finetuned_model, finetuned_compressed_model,args.save_dir) +compress_diff(base_model, finetuned_model, finetuned_compressed_model,args.save_dir,args) tokenizer.save_pretrained(args.save_dir) diff --git a/bitdelta/utils.py b/bitdelta/utils.py index 1304239..265ba27 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -26,6 +26,8 @@ def parse_args(): parser.add_argument("--max_length", type=int, default=128) parser.add_argument("--save_dir", type=str, required=True) parser.add_argument("--train", action="store_true") + parser.add_argument("--attn_outlier", type=float,default=1e-4) + parser.add_argument("--mlp_outlier", type=float,default=1e-4) # device management parser.add_argument("--base_model_device", 
type=str, default="0") diff --git a/run.sh b/run.sh index 6eceba8..d9e5c0b 100644 --- a/run.sh +++ b/run.sh @@ -1,15 +1,22 @@ -MODEL_SAVE_DIR=save/uncalibrated_model +MODEL_SAVE_DIR=./../save/uncalibrated_llava mkdir -p $MODEL_SAVE_DIR +values=(0.05 0.2 0.4 0.5 0.75) + +# for value in ${values[@]} +# do CUDA_VISIBLE_DEVICES=5,6 python \ bitdelta/train2.py \ - --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ - --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ + --base_model /home/pingbowen/models/vicuna-13b-v1.5 \ + --finetuned_model /home/pingbowen/models/Llava-v1.5 \ --save_dir $MODEL_SAVE_DIR \ --batch_size 4 \ --num_steps 200 \ --save_full_model True \ + --attn_outlier 0.2 \ + --mlp_outlier 0.1 \ # &> test.log +# done # /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ From 53a0fc7a00cd5f4a159952c6005e1589b32b15a0 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Sun, 31 Mar 2024 18:20:59 +0800 Subject: [PATCH 11/14] load llava --- bitdelta/diff2.py | 8 ++++++-- bitdelta/train2.py | 13 ++++++++----- bitdelta/utils.py | 40 +++++++++++++++++++++++++++++++++++++++- run.sh | 2 +- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 8489ccc..9e587d0 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -102,7 +102,7 @@ def copy_nonzero_values(A, B): A[mask] = B[mask] return A -def compress_diff(base_model, finetuned_model, finetuned_compressed_model,save_dir,args): +def compress_diff(base_model, finetuned_model, save_dir,args): def compress_submodule(name, subname, module, submodule): target_device = submodule.weight.device @@ -121,7 +121,11 @@ def compress_submodule(name, subname, module, submodule): setattr(module, subname, compressed) # TODO: 根据thresh 选择压缩比例 - for name, module in finetuned_compressed_model.named_modules(): + for name, module in finetuned_model.named_modules(): + + if "vision" in name: + continue + if "self_attn" in name or "mlp" in name: for subname, submodule in module.named_children(): diff --git a/bitdelta/train2.py b/bitdelta/train2.py index e2fed61..47fda22 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -6,7 +6,7 @@ from bitdelta.diff2 import compress_diff, save_diff, save_full_model from bitdelta.misc import find_corr_stddev -from bitdelta.utils import get_model, parse_args, get_tokenizer +from bitdelta.utils import get_model, parse_args, get_tokenizer,load_llava from tqdm import tqdm from bitdelta.data import get_dataset, get_dataloader @@ -21,11 +21,14 @@ with torch.no_grad(): base_model = get_model(args.base_model, args.base_model_device, args.base_model_memory_map) - finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) + if "llava" in args.finetuned_model.lower(): + finetuned_model = load_llava(args.finetuned_model,device="cuda" if torch.cuda.is_available() else "cpu") + else: + finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) -finetuned_compressed_model = get_model(args.finetuned_model, args.finetuned_compressed_model_device, args.finetuned_compressed_model_memory_map) +import pdb;pdb.set_trace() print(f"compressing diff...") -compress_diff(base_model, finetuned_model, finetuned_compressed_model,args.save_dir,args) +compress_diff(base_model, finetuned_model, args.save_dir,args) -tokenizer.save_pretrained(args.save_dir) +tokenizer.save_pretrained(args.save_dir) \ No newline at end of file diff --git 
a/bitdelta/utils.py b/bitdelta/utils.py index 265ba27..c957552 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -1,8 +1,46 @@ import argparse import transformers import torch -from transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM,AutoTokenizer from accelerate import infer_auto_device_map, init_empty_weights +import os +from llava.model import * +from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + +def load_llava(path,device): + tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False) + model = LlavaLlamaForCausalLM.from_pretrained( + path, + low_cpu_mem_usage=True, + torch_dtype=torch.bfloat16, + ).to(device) + + + image_processor = None + + if 'llava' in path.lower(): + mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False) + mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True) + if mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + if mm_use_im_start_end: + tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + model.resize_token_embeddings(len(tokenizer)) + + vision_tower = model.get_vision_tower() + if not vision_tower.is_loaded: + vision_tower.load_model(device_map=device) + if device != 'auto': + vision_tower.to(device=device, dtype=torch.float16) + image_processor = vision_tower.image_processor + + if hasattr(model.config, "max_sequence_length"): + context_len = model.config.max_sequence_length + else: + context_len = 2048 + + return model + def parse_args(): parser = argparse.ArgumentParser(description="BitDelta") diff --git a/run.sh b/run.sh index d9e5c0b..9c9f591 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=./../save/uncalibrated_llava +MODEL_SAVE_DIR=./../save/llama_7b_chat_attn_mlp_outlier_0.2_0.1/ mkdir -p $MODEL_SAVE_DIR From 85ea71f64033bb1e7718ee92a7f25948469d14d2 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Sun, 31 Mar 2024 19:32:39 +0800 Subject: [PATCH 12/14] support mix and low_bit --- bitdelta/diff2.py | 70 +++++++++++++++++----------------------------- bitdelta/train2.py | 2 +- bitdelta/utils.py | 2 +- run.sh | 9 ++++-- 4 files changed, 34 insertions(+), 49 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 9e587d0..7cf946f 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -135,53 +135,35 @@ def compress_submodule(name, subname, module, submodule): f = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) delta , outlier_U, outlier_V = f - p , None, None - dim , fp16_col = 1024, 64 - if "self_attn" in name: - U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,attn_outlier=args.attn_outlier) - else: - dim , fp16_col = 1024 , 128 - # delta , scaled_p = solve_orthogonal(p, f) - U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,mlp_outlier=args.mlp_outlier) - - compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) - U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff - weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff - U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T + if args.choice == "mix": + dim , fp16_col = 1024, 64 + + if "self_attn" in name: + U,S,V,outlier_U,outlier_V = 
decomposition(delta,dim=dim,name=name,attn_outlier=args.attn_outlier) + else: + dim , fp16_col = 1024 , 128 + # delta , scaled_p = solve_orthogonal(p, f) + U,S,V,outlier_U,outlier_V = decomposition(delta,dim=dim,name=name,mlp_outlier=args.mlp_outlier) + + compressed_U, compressed_V = BinaryDiff(weight=U[:,fp16_col:]).to(f.device), BinaryDiff(weight=V[:,fp16_col:]).to(f.device) + U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff + weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff + U[:,fp16_col:] , V[:,fp16_col:] = weight_U.T, weight_V.T - - if outlier_U is not None and outlier_V is not None: - copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) - # import pdb; pdb.set_trace() - - delta = U @ torch.diag(S) @ V.t() + + if outlier_U is not None and outlier_V is not None: + copy_nonzero_values(U[:,fp16_col:], outlier_U) , copy_nonzero_values(V[:,fp16_col:], outlier_V) + # import pdb; pdb.set_trace() + + delta = U @ torch.diag(S) @ V.t() + elif args.choice == "bit": + compressed = BinaryDiff(weight=delta).to(f.device) + mask , coeff = compressed.mask, compressed.coeff + delta = (unpack(mask)*2-1) * coeff + delta = delta.T + finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) - - ''' - fp 16 + 1bit - - if "proj" in subname: - base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device) - dim , thresh = 1024,0.7 - - if "mlp" in name: - dim , thresh = 2048 , 0.24 - - U,S,V = decomposition(finetuned_weight - base_weight,dim=dim) - energy_total = torch.sum(S**2) - energy_top_percent = torch.sum(S[:50]**2) - ratio = energy_top_percent / energy_total - - compressed_U, compressed_V = BinaryDiff(weight=U[:,64:]).to(finetuned_weight.device), BinaryDiff(weight=V[:,64:]).to(finetuned_weight.device) - U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff - weight_U , weight_V = (unpack(U_mask)*2-1) * U_coeff, (unpack(V_mask)*2-1) * V_coeff - # import pdb; pdb.set_trace() - U[:,64:] , V[:,64:] = weight_U.T, weight_V.T # 不确定是否有bug - delta = U @ torch.diag(S) @ V.t() - with torch.no_grad(): - finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(base_weight.dtype)) - ''' # import pdb ; pdb.set_trace() finetuned_model.to(torch.bfloat16) finetuned_model.save_pretrained(save_dir) diff --git a/bitdelta/train2.py b/bitdelta/train2.py index 47fda22..bc2f7e5 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -27,7 +27,7 @@ finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) -import pdb;pdb.set_trace() +# import pdb;pdb.set_trace() print(f"compressing diff...") compress_diff(base_model, finetuned_model, args.save_dir,args) diff --git a/bitdelta/utils.py b/bitdelta/utils.py index c957552..18ea947 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -66,6 +66,7 @@ def parse_args(): parser.add_argument("--train", action="store_true") parser.add_argument("--attn_outlier", type=float,default=1e-4) parser.add_argument("--mlp_outlier", type=float,default=1e-4) + parser.add_argument("--choice", type=str,choices=['mix','bit','rank'],default=None) # device management parser.add_argument("--base_model_device", type=str, 
default="0") @@ -144,7 +145,6 @@ def get_model(model_name, device, memory_map=None): else: # single-gpu or cpu return transformers.AutoModelForCausalLM.from_pretrained( model_name, - # torch_dtype=torch.float16, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, ).to(device) diff --git a/run.sh b/run.sh index 9c9f591..42d5b26 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ -MODEL_SAVE_DIR=./../save/llama_7b_chat_attn_mlp_outlier_0.2_0.1/ +MODEL_SAVE_DIR=./../save/test mkdir -p $MODEL_SAVE_DIR @@ -8,15 +8,18 @@ values=(0.05 0.2 0.4 0.5 0.75) # do CUDA_VISIBLE_DEVICES=5,6 python \ bitdelta/train2.py \ - --base_model /home/pingbowen/models/vicuna-13b-v1.5 \ - --finetuned_model /home/pingbowen/models/Llava-v1.5 \ + --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ + --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ --save_dir $MODEL_SAVE_DIR \ --batch_size 4 \ --num_steps 200 \ --save_full_model True \ --attn_outlier 0.2 \ --mlp_outlier 0.1 \ + --choice bit # &> test.log # done # /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ + # /home/pingbowen/models/vicuna-13b-v1.5 , /home/pingbowen/models/Llava-v1.5 + # /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ From f93952c6d7477f4913b4ae5a7a7a400eabe6bdf3 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Mon, 29 Apr 2024 16:38:34 +0800 Subject: [PATCH 13/14] support svd --- bitdelta/diff2.py | 51 ++++++++++---- bitdelta/train2.py | 1 - bitdelta/utils.py | 11 +-- run.sh | 28 +++++--- run_tailor.sh | 13 ++-- tailor.py | 163 ++++++++------------------------------------- test.py | 43 ++++++++---- 7 files changed, 126 insertions(+), 184 deletions(-) diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index 7cf946f..ab356d3 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -121,8 +121,8 @@ def compress_submodule(name, subname, module, submodule): setattr(module, subname, compressed) # TODO: 根据thresh 选择压缩比例 + param_dict = dict() for name, module in finetuned_model.named_modules(): - if "vision" in name: continue @@ -162,9 +162,32 @@ def compress_submodule(name, subname, module, submodule): mask , coeff = compressed.mask, compressed.coeff delta = (unpack(mask)*2-1) * coeff delta = delta.T + elif args.choice == "svd": + dim = 1024 + + if "mlp" in name: + dim = int(1024 * 1.45) + + U , S , V = decomposition((f - p).clone().detach(),dim=dim) + param_dict[f"{name}.{subname}" + ".base"] = p + param_dict[f"{name}.{subname}" + ".U"] = U.to(p.dtype) + param_dict[f"{name}.{subname}" + ".S"] = S.to(p.dtype) + param_dict[f"{name}.{subname}" + ".V"] = V.to(p.dtype) + # if "llava" in args.finetuned_model.lower(): + # U , S , V = decomposition((f - p).clone().detach(),dim=1024) + # param_dict[f"{name}.{subname}" + ".base"] = p + # param_dict[f"{name}.{subname}" + ".U"] = U.to(p.dtype) + # param_dict[f"{name}.{subname}" + ".S"] = S.to(p.dtype) + # param_dict[f"{name}.{subname}" + ".V"] = V.to(p.dtype) finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) - # import pdb ; pdb.set_trace() + + # if "llava" in args.finetuned_model.lower(): + # torch.save(param_dict, "/home/pingbowen/workspace/delta-compression/saved_model/llava_svd.pt") + if args.choice == "svd": + torch.save(param_dict, args.svd_dict) + + finetuned_model.to(torch.bfloat16) finetuned_model.save_pretrained(save_dir) @@ -231,26 +254,26 @@ def decomposition(masked_input_tensor,dim=None,name=None,attn_outlier=0.1,mlp_ou if dim is not None: U , S , V = U[:, :dim],S[:dim] ,V[:, :dim] - 
if "self_attn" in name: - outlier_U = get_outlier(U[:,64:], percent=attn_outlier) - outlier_V = get_outlier(V[:,64:], percent=attn_outlier) + # if "self_attn" in name: + # outlier_U = get_outlier(U[:,64:], percent=attn_outlier) + # outlier_V = get_outlier(V[:,64:], percent=attn_outlier) - set_zero(U[:,64:], outlier_U) - # import pdb; pdb.set_trace() - set_zero(V[:,64:], outlier_V) + # set_zero(U[:,64:], outlier_U) + # # import pdb; pdb.set_trace() + # set_zero(V[:,64:], outlier_V) - else: - outlier_U = get_outlier(U[:,128:], percent=mlp_outlier) - outlier_V = get_outlier(V[:,128:], percent=mlp_outlier) + # else: + # outlier_U = get_outlier(U[:,128:], percent=mlp_outlier) + # outlier_V = get_outlier(V[:,128:], percent=mlp_outlier) - set_zero(U[:,128:], outlier_U) - set_zero(V[:,128:], outlier_V) + # set_zero(U[:,128:], outlier_U) + # set_zero(V[:,128:], outlier_V) # max_val, min_val, mean_abs_val = round(torch.max(U).item(),4), round(torch.min(U).item(),4), round(torch.mean(torch.abs(U)).item(),4) # print(f"max_val {max_val} pos_min {round(torch.min(outlier[outlier > 0]).item(),4)} mean_abs_val {mean_abs_val} ratio {round(torch.min(outlier[outlier > 0]).item() / mean_abs_val,4)}") # import pdb; pdb.set_trace() - return U, S, V , outlier_U, outlier_V + return U, S, V # , outlier_U, outlier_V def save_full_model(base_model_name, finetuned_model_name, diff_dir, save_dir, device,layers=None,ori_diff=None): base_model = get_model(base_model_name, device) diff --git a/bitdelta/train2.py b/bitdelta/train2.py index bc2f7e5..7073a57 100644 --- a/bitdelta/train2.py +++ b/bitdelta/train2.py @@ -27,7 +27,6 @@ finetuned_model = get_model(args.finetuned_model, args.finetuned_model_device, args.finetuned_model_memory_map) -# import pdb;pdb.set_trace() print(f"compressing diff...") compress_diff(base_model, finetuned_model, args.save_dir,args) diff --git a/bitdelta/utils.py b/bitdelta/utils.py index 18ea947..5e59508 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -4,8 +4,10 @@ from transformers import AutoConfig, AutoModelForCausalLM,AutoTokenizer from accelerate import infer_auto_device_map, init_empty_weights import os -from llava.model import * -from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +try: + from llava.model import * + from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN +except: pass def load_llava(path,device): tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False) @@ -44,7 +46,7 @@ def load_llava(path,device): def parse_args(): parser = argparse.ArgumentParser(description="BitDelta") - + # # models parser.add_argument( "--finetuned_model", type=str, default="lmsys/vicuna-7b-v1.5-16k" @@ -52,6 +54,7 @@ def parse_args(): parser.add_argument("--base_model", type=str, default="meta-llama/Llama-2-7b-hf") # train params + parser.add_argument("--svd_dict", type=str, default="") parser.add_argument("--dataset_name", type=str, default="c4") parser.add_argument("--subset", type=str, default="en") parser.add_argument("--data_dir", type=str, default="en") @@ -66,7 +69,7 @@ def parse_args(): parser.add_argument("--train", action="store_true") parser.add_argument("--attn_outlier", type=float,default=1e-4) parser.add_argument("--mlp_outlier", type=float,default=1e-4) - parser.add_argument("--choice", type=str,choices=['mix','bit','rank'],default=None) + parser.add_argument("--choice", type=str,choices=['mix','bit','svd'],default=None) # device management 
parser.add_argument("--base_model_device", type=str, default="0") diff --git a/run.sh b/run.sh index 42d5b26..9fe3cbf 100644 --- a/run.sh +++ b/run.sh @@ -1,25 +1,33 @@ -MODEL_SAVE_DIR=./../save/test +MODEL_SAVE_DIR=/home/pingbowen/workspace/delta-compression/save/test mkdir -p $MODEL_SAVE_DIR values=(0.05 0.2 0.4 0.5 0.75) -# for value in ${values[@]} -# do -CUDA_VISIBLE_DEVICES=5,6 python \ +pretrained_model=(/data/public/opensource_models/codellama/codellama-7b-python-hf/ /data/public/opensource_models/meta-llama/Llama-2-7b-hf/) +finetuned_model=(/data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/) +svd_dict=(/home/pingbowen/workspace/delta-compression/saved_model/magicoder_svd.pt /home/pingbowen/workspace/delta-compression/saved_model/llama_chat_svd.pt) +for (( i=0; i<2; i++ )); do + +gpu0=$((2 * i)) +gpu1=$((2 * i + 1)) + +CUDA_VISIBLE_DEVICES="$gpu0,$gpu1" python \ bitdelta/train2.py \ - --base_model /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ - --finetuned_model /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ \ + --base_model ${pretrained_model[$i]} \ + --finetuned_model ${finetuned_model[$i]} \ --save_dir $MODEL_SAVE_DIR \ --batch_size 4 \ --num_steps 200 \ --save_full_model True \ --attn_outlier 0.2 \ --mlp_outlier 0.1 \ - --choice bit + --svd_dict ${svd_dict[$i]} \ + --choice svd & # &> test.log -# done - - # /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ +done +wait + # /data/public/opensource_models/codellama/codellama-7b-python-hf/ /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ # /home/pingbowen/models/vicuna-13b-v1.5 , /home/pingbowen/models/Llava-v1.5 # /data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ + # /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ diff --git a/run_tailor.sh b/run_tailor.sh index 71b7393..5149cfc 100644 --- a/run_tailor.sh +++ b/run_tailor.sh @@ -1,11 +1,14 @@ -python \ - tailor.py \ - --finetuned_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf \ - --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/7b_chat \ +CUDA_VISIBLE_DEVICES=2,3 python tailor.py \ + --pretrained_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ + --finetuned_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/\ + --dim 128 \ + --scale_factor 1.45 \ + --save_dir /home/pingbowen/save/Llama-2-7b-chat_svd # & - +# /data/public/opensource_models/codellama/codellama-7b-python-hf/ +# /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ # python3 tailor.py \ # --finetuned_model_name /data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf \ # --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/math_lora_7b \ \ No newline at end of file diff --git a/tailor.py b/tailor.py index 270bdaa..74d2d04 100755 --- a/tailor.py +++ b/tailor.py @@ -15,22 +15,26 @@ import re import random import numpy as np -import math parser = argparse.ArgumentParser() -parser.add_argument('--finetuned_model_name', type=str, required=True, help='finetuned model name') -parser.add_argument('--save_dir', type=str, required=True, help='finetuned model name') +parser.add_argument('--pretrained_model_name', type=str, help='pretrained model name') +parser.add_argument('--finetuned_model_name', type=str, help='finetuned model name') +parser.add_argument('--save_dir', type=str, 
help='finetuned model name') +parser.add_argument('--dim', type=int, help='finetuned model name') +parser.add_argument('--scale_factor', type=float, default=1.45, help='finetuned model name') args = parser.parse_args() -pretrained_model_name = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf" +device = "cuda" if torch.cuda.is_available() else "cpu" -finetuned_model_name = args.finetuned_model_name # /data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf -pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name, - device_map="cpu") +pretrained_model_name = args.pretrained_model_name + +finetuned_model_name = args.finetuned_model_name +pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=pretrained_model_name,torch_dtype=torch.bfloat16).to(device) pretrained_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name) -finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name, - device_map="cpu") + +finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name,torch_dtype=torch.bfloat16).to(device) finetuned_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=finetuned_model_name) + def set_random_seed(seed: int = 0): """ set random seed @@ -46,145 +50,30 @@ def set_random_seed(seed: int = 0): torch.backends.cudnn.benchmark = False set_random_seed(seed=0) -# scale_factor = finetuned_model.config.intermediate_size / finetuned_model.config.hidden_size - - -scale_factor = 1.45 -def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list): - """ - get the names of parameters that need to be merged - :param input_param_names: list, names of input parameters - :param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded - :return: - """ - param_names_to_merge = [] - for param_name in input_param_names: - exclude = any([re.match(exclude_pattern, param_name) for exclude_pattern in exclude_param_names_regex]) - if not exclude: - param_names_to_merge.append(param_name) - return param_names_to_merge - - -# import pdb -# pdb.set_trace() +scale_factor = args.scale_factor def decomposition(masked_input_tensor,dim): - U , S , V = torch.svd(masked_input_tensor) + U , S , V = torch.svd(masked_input_tensor.to(torch.float32)) U , S , V = U[:, :dim],S[:dim],V[:, :dim] # return torch.mm(U, torch.diag(S)), V.t() - return torch.mm(U, torch.mm(torch.diag(S), V.t())) #return lora_B, lora_A + return U @ torch.diag(S) @ V.t() #return lora_B, lora_A -# dim = 1024 -dim = 128 -# dim = 64 -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -print("----------------------dim: ",dim) -peft_dict = {} -malign_dict = {} -other_dict = {} - -task_vector_param_dict = {} -pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()} -finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()} -param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[]) with torch.no_grad(): - for param_name in param_names_to_merge: - if "self_attn" in param_name or "mlp" in 
param_name: - # import pdb ;pdb.set_trace() - if "mlp" in param_name: - dim = math.ceil(dim * scale_factor) - - delta = decomposition(finetuned_param_dict[param_name] - pretrained_param_dict[param_name],dim=dim) - finetuned_model.get_submodule(param_name.replace(".weight", "")).weight.copy_(pretrained_model.get_submodule(param_name.replace(".weight", "")).weight + delta) - # print(f"name {param_name} data {task_vector_param_dict[param_name]} ") - + for k,v in finetuned_model.state_dict().items(): + dim = args.dim + if ".weight" in k: + if "self_attn" in k or "mlp" in k: + if "mlp" in k: + dim = int(dim * scale_factor) + p = pretrained_model.get_submodule(k.replace(".weight", "")).weight + delta = decomposition(v - p,dim).to(v.dtype) + # import pdb; pdb.set_trace() + finetuned_model.get_submodule(k.replace(".weight", "")).weight.copy_(p + delta) finetuned_model.save_pretrained(save_directory=args.save_dir) finetuned_tokenizer.save_pretrained(save_directory=args.save_dir) -# for param_name, param_value in tqdm(task_vector_param_dict.items()): -# if "self_attn" in param_name or "mlp" in param_name: -# lora_B, lora_A = decomposition(param_value,dim=dim) -# lora_A = lora_A * (dim/16) ###补偿scaling, 以后的alpha可以统一为16 -# peft_key = "base_model.model." + param_name.split(".weight")[0] -# print(peft_key+".lora_A.weight") -# peft_dict[peft_key+".lora_A.weight"] = lora_A.contiguous() -# peft_dict[peft_key+".lora_B.weight"] = lora_B.contiguous() - - -# other_dict = {k: v.to(torch.float16) for k, v in other_dict.items()} - -# other_para_path = "/home/wanghanqing/projects/exp/mAlign_exp/lang_LoRAs/peft_ver/trim_lora/code/other_param" -# torch.save(other_dict, os.path.join(other_para_path, "other.pt")) -# torch.save(other_dict, os.path.join(other_para_path, "pretrain_other.pt")) - - -peft_dict = {k: v.to(torch.float16) for k, v in peft_dict.items()} - -# layernum = 40 -# for lnum in range(layernum): -# peft_pfx = f"base_model.model.model.layers.{lnum}" -# delta_pfx = f"encoder.layers.{lnum}" -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_q_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.q_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_q_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.q_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_k_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.k_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_k_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.k_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_v_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.v_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.project_v_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.v_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.attention_out_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.self_attn.o_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.self_att.self_attention.attention_out_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.self_attn.o_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_0_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.gate_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_0_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.gate_proj.lora_B.weight"].contiguous() -# 
malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_1_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.up_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_in.w_1_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.up_proj.lora_B.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_out_lora.lora_A.weight"] = peft_dict[f"{peft_pfx}.mlp.down_proj.lora_A.weight"].contiguous() -# malign_dict[f"{delta_pfx}.ffn.ffn.w_out_lora.lora_B.weight"] = peft_dict[f"{peft_pfx}.mlp.down_proj.lora_B.weight"].contiguous() - - - - - -malign_dict = {k: v.to(torch.float16) for k, v in malign_dict.items()} - -# import pdb -# pdb.set_trace() - -output_peft_path = "/home/wanghanqing/projects/exp/mAlign_exp/lang_LoRAs/peft_ver/trim_lora/dim256_2/code" -output_malign_path = "/home/wanghanqing/projects/exp/mAlign_exp/mAlign_LoRAs/trim_lora/dim256_2/code" - -# torch.save(peft_dict, os.path.join(output_peft_path, "adapter_model.bin")) -# torch.save(malign_dict, os.path.join(output_malign_path, "lora.pt")) - print("--end--") - - - - - -# num , masked_input_tensor = 0,input_tensor -# if "self_attn" in param_name or "mlp" in param_name: -# if "mlp" in param_name: -# dim = math.ceil(dim * scale_factor) -# thresh_hold = 0.06752 -# num, masked_input_tensor = decomposition(input_tensor,dim=dim) - - - - - -# for param_name, param_value in finetuned_model.named_parameters(): -# if param_name in masked_param_dict: -# param_value.data.copy_(masked_param_dict[param_name]) - -# logger.info(f"saving model at {save_model_path}...") -# os.makedirs(save_model_path, exist_ok=True) -# finetuned_model.save_pretrained(save_directory=save_model_path) -# finetuned_tokenizer.save_pretrained(save_directory=save_model_path) -# logger.info(f"model is saved") \ No newline at end of file diff --git a/test.py b/test.py index ab02ab3..500e631 100644 --- a/test.py +++ b/test.py @@ -133,19 +133,36 @@ def copy_nonzero_values(A, B): A[mask] = B[mask] return A - -# 示例 -n = 4 -A = torch.randn(n, n) # 随机生成一个n × n的张量A -B = torch.zeros(n, n) # 创建一个n × n的全零张量B -A = A.flatten() -values , top_indices = torch.topk(A, 1, largest=True) - - -import pdb; pdb.set_trace() -# params = base_model.state_dict() - -# print(params.keys()) +def load_svd(model): + param_dict = torch.load("/home/pingbowen/workspace/delta-compression/saved_model/llava_svd.pt") + + with torch.no_grad(): + for k,v in param_dict.items(): + if "base" in k: + dim = args.dim + + if "mlp" in k: + dim = int(dim * 1.45) + + k = k.replace(".base", "") + + U = param_dict[k + ".U"][:, :dim] + S = param_dict[k + ".S"][:dim] + V = param_dict[k + ".V"][:, :dim] + # import pdb; pdb.set_trace() + model.get_submodule(k).weight.copy_(v + U @ torch.diag(S) @ V.t()) + +parser = argparse.ArgumentParser(description="BitDelta") +parser.add_argument("--dim", type=int, default=128) +args = parser.parse_args() + +tokenizer = AutoTokenizer.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/") +model = AutoModelForCausalLM.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) + +load_svd(model) + +tokenizer.save_pretrained(f"/home/pingbowen/workspace/delta-compression/save/Llama-chat-svd_{args.dim}/") +model.save_pretrained(f"/home/pingbowen/workspace/delta-compression/save/Llama-chat-svd_{args.dim}/") # get_tokenizer("/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/") # save_full_model("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/", 
"/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/", os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "diff_untrained.pt"), os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "uncalibrated_model"), device="cuda") \ No newline at end of file From 29f7485e56c20ce32be5f59a5924a4bb2e939e14 Mon Sep 17 00:00:00 2001 From: pingbowen Date: Sat, 11 May 2024 10:40:19 +0800 Subject: [PATCH 14/14] support plot et al. --- Plot.py | 28 ++++++++++++++++++++++++++++ bitdelta/diff2.py | 14 ++++++++++---- bitdelta/utils.py | 2 ++ run.sh | 34 +++++++++++++++++++++++----------- run_tailor.sh | 29 +++++++++++++++++++++++------ tailor.py | 36 +++++++++++++++++++++++++----------- test.py | 16 +++++++++------- 7 files changed, 120 insertions(+), 39 deletions(-) create mode 100644 Plot.py diff --git a/Plot.py b/Plot.py new file mode 100644 index 0000000..a6d550d --- /dev/null +++ b/Plot.py @@ -0,0 +1,28 @@ +import matplotlib.pyplot as plt +import numpy as np +import torch +import argparse +def plot_bit_delta(title): + plt.figure(figsize=(10, 5)) + plt.plot(bit_delta, label=f'Bit-Delta {map[args.param_type]}') + plt.plot(svd_delta, label=f'svd Data {map[args.param_type]}') + plt.plot(mix_delta, label=f'Ours {map[args.param_type]}') + plt.title("Comparison of the Cosine Similarity between the Bit-Delta, SVD, and our method with WizardMath-7B-v1.0") + plt.xlabel(f'{map[args.param_type]} of each layer') # X轴标题 + plt.ylabel('Cosine Similarity Value') # Y轴标题 + plt.legend() + plt.savefig(f'./figures/{map[args.param_type]}_cos_sim.pdf') + plt.show() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--param_type', type=str, help='finetuned model name') + map = {"q_proj":"Query_proj", "k_proj":"Key_proj","v_proj":"Value_proj","o_proj":"Output_proj","gate_proj":"Gate_proj","up_proj":"Up_proj","down_proj":"Down_proj"} + args = parser.parse_args() + + bit_delta = torch.load(f'./statistic/{args.param_type}_bitdelta_cos_sim.pt') + svd_delta = torch.load(f'./statistic/{args.param_type}_svd_cos_sim.pt') + mix_delta = torch.load(f'./statistic/{args.param_type}_mix_cos_sim.pt') + + plot_bit_delta('Cosine Similarity of Bit-Delta, svd and mixed Data') \ No newline at end of file diff --git a/bitdelta/diff2.py b/bitdelta/diff2.py index ab356d3..1afabdc 100644 --- a/bitdelta/diff2.py +++ b/bitdelta/diff2.py @@ -123,6 +123,7 @@ def compress_submodule(name, subname, module, submodule): # TODO: 根据thresh 选择压缩比例 param_dict = dict() for name, module in finetuned_model.named_modules(): + # import pdb; pdb.set_trace() if "vision" in name: continue @@ -162,11 +163,15 @@ def compress_submodule(name, subname, module, submodule): mask , coeff = compressed.mask, compressed.coeff delta = (unpack(mask)*2-1) * coeff delta = delta.T + + if "llava" in args.finetuned_model.lower(): + param_dict[f"{name}.{subname}" + ".weight"] = p + delta.to(p.dtype) + elif args.choice == "svd": - dim = 1024 + dim = args.dim if "mlp" in name: - dim = int(1024 * 1.45) + dim = int(dim * args.scale_factor) U , S , V = decomposition((f - p).clone().detach(),dim=dim) param_dict[f"{name}.{subname}" + ".base"] = p @@ -182,8 +187,9 @@ def compress_submodule(name, subname, module, submodule): finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(p.to(p.dtype) + delta.to(p.dtype)) - # if "llava" in args.finetuned_model.lower(): - # torch.save(param_dict, "/home/pingbowen/workspace/delta-compression/saved_model/llava_svd.pt") + if "llava" in args.finetuned_model.lower() and 
args.choice == "bit": + torch.save(param_dict, args.svd_dict) + if args.choice == "svd": torch.save(param_dict, args.svd_dict) diff --git a/bitdelta/utils.py b/bitdelta/utils.py index 5e59508..3fff503 100644 --- a/bitdelta/utils.py +++ b/bitdelta/utils.py @@ -61,6 +61,8 @@ def parse_args(): parser.add_argument("--split", type=str, default="train") parser.add_argument("--lr", type=float, default=1e-4) parser.add_argument("--num_steps", type=int, default=100) + parser.add_argument("--dim", type=int, default=1024) + parser.add_argument("--scale_factor", type=float, default=1.45) parser.add_argument("--batch_size", type=int, default=4) parser.add_argument("--layers", nargs='+', default=None) parser.add_argument("--save_num", type=int, default=0) diff --git a/run.sh b/run.sh index 9fe3cbf..37ed843 100644 --- a/run.sh +++ b/run.sh @@ -1,30 +1,42 @@ -MODEL_SAVE_DIR=/home/pingbowen/workspace/delta-compression/save/test +MODEL_SAVE_DIR=/home/pingbowen/workspace/delta-compression/save/mistral-v0.2_bitdelta mkdir -p $MODEL_SAVE_DIR values=(0.05 0.2 0.4 0.5 0.75) -pretrained_model=(/data/public/opensource_models/codellama/codellama-7b-python-hf/ /data/public/opensource_models/meta-llama/Llama-2-7b-hf/) -finetuned_model=(/data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/) -svd_dict=(/home/pingbowen/workspace/delta-compression/saved_model/magicoder_svd.pt /home/pingbowen/workspace/delta-compression/saved_model/llama_chat_svd.pt) +pretrained_model=(/data/public/opensource_models/meta-llama/Llama-2-7b-hf/ ) +finetuned_model=(/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf +) +svd_dict=(/data/groups/QY_LLM_Other/pingbowen/models/codelora/codelora_svd.pt / ) +save_dir=(/home/pingbowen/workspace/delta-compression/save/test /data/groups/QY_LLM_Other/pingbowen/models/codelora/codelora_bitdelta/) + for (( i=0; i<2; i++ )); do +# choice="svd" +if [ $i -eq 0 ]; then + choice="svd" +else + choice="bit" +fi + gpu0=$((2 * i)) gpu1=$((2 * i + 1)) - -CUDA_VISIBLE_DEVICES="$gpu0,$gpu1" python \ +# "$gpu0,$gpu1" +CUDA_VISIBLE_DEVICES=$((i + 1)) python \ bitdelta/train2.py \ - --base_model ${pretrained_model[$i]} \ - --finetuned_model ${finetuned_model[$i]} \ - --save_dir $MODEL_SAVE_DIR \ + --base_model ${pretrained_model[0]} \ + --finetuned_model ${finetuned_model[0]} \ + --save_dir ${save_dir[$i]} \ --batch_size 4 \ --num_steps 200 \ --save_full_model True \ --attn_outlier 0.2 \ --mlp_outlier 0.1 \ --svd_dict ${svd_dict[$i]} \ - --choice svd & - # &> test.log + --dim 1024 \ + --scale_factor 1.46 \ + --choice $choice & + # &> test.log # ${save_dir[$i]} done wait # /data/public/opensource_models/codellama/codellama-7b-python-hf/ /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ diff --git a/run_tailor.sh b/run_tailor.sh index 5149cfc..a3ff799 100644 --- a/run_tailor.sh +++ b/run_tailor.sh @@ -1,9 +1,26 @@ -CUDA_VISIBLE_DEVICES=2,3 python tailor.py \ - --pretrained_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ \ - --finetuned_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/\ - --dim 128 \ - --scale_factor 1.45 \ - --save_dir /home/pingbowen/save/Llama-2-7b-chat_svd +pretrained_model=(/data/public/opensource_models/meta-llama/Llama-2-7b-hf/ /data/public/opensource_models/meta-llama/Llama-2-7b-hf/ /data/public/opensource_models/codellama/codellama-7b-python-hf/ /home/pingbowen/models/vicuna-7b-v1.5) 
+finetuned_model=(/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/ /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/ /data/groups/QY_LLM_Other/OSS_Code_LLM/Magicoder-S-CL-7B/ /home/pingbowen/models/llava-v1.5-7b) +finetuned_compressed_model=(/home/pingbowen/workspace/delta-compression/saved_model/WizardMath-7B-V1.0_bitdelta/ /data/groups/QY_LLM_Other/pingbowen/models/wizardmath/WizardMath_svd/ /data/groups/QY_LLM_Other/pingbowen/models/wizardmath/delta_1024_mix_32_8_3_2_full/) +param_types=(q_proj k_proj v_proj o_proj gate_proj up_proj down_proj) +model_types=(svd bitdelta mix) + +for (( j=0; j<${#param_types[@]}; j++ )); do + CUDA_VISIBLE_DEVICES=1 python3 Plot.py --param_type ${param_types[$j]} +done +# for i in {0..2} +# do +# for (( j=0; j<${#param_types[@]}; j++ )); do +# CUDA_VISIBLE_DEVICES=1,7 python tailor.py \ +# --pretrained_model_name ${pretrained_model[0]} \ +# --finetuned_model_name ${finetuned_model[0]} \ +# --finetuned_compressed_model ${finetuned_compressed_model[$i]} \ +# --dim 128 \ +# --scale_factor 1.45 \ +# --param_type ${param_types[$j]} \ +# --model_type ${model_types[$i]} \ +# --save_dir ./statistic/ +# done +# done # & diff --git a/tailor.py b/tailor.py index 74d2d04..d1f7f88 100755 --- a/tailor.py +++ b/tailor.py @@ -15,11 +15,15 @@ import re import random import numpy as np +import torch.nn.functional as F parser = argparse.ArgumentParser() parser.add_argument('--pretrained_model_name', type=str, help='pretrained model name') parser.add_argument('--finetuned_model_name', type=str, help='finetuned model name') +parser.add_argument('--finetuned_compressed_model', type=str, help='finetuned model name') parser.add_argument('--save_dir', type=str, help='finetuned model name') +parser.add_argument('--param_type', type=str, help='finetuned model name') +parser.add_argument('--model_type', type=str, help='finetuned model name') parser.add_argument('--dim', type=int, help='finetuned model name') parser.add_argument('--scale_factor', type=float, default=1.45, help='finetuned model name') args = parser.parse_args() @@ -35,6 +39,7 @@ finetuned_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=finetuned_model_name,torch_dtype=torch.bfloat16).to(device) finetuned_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=finetuned_model_name) +finetuned_compressed_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=args.finetuned_compressed_model,torch_dtype=torch.bfloat16).to(device) def set_random_seed(seed: int = 0): """ set random seed @@ -59,21 +64,30 @@ def decomposition(masked_input_tensor,dim): # return torch.mm(U, torch.diag(S)), V.t() return U @ torch.diag(S) @ V.t() #return lora_B, lora_A +L2_norm_total,L1_norm_total, mag, num = 0, 0 , 0 ,0 + +l2_norm ,cos_sim = [],[] with torch.no_grad(): - for k,v in finetuned_model.state_dict().items(): + for k,v in finetuned_compressed_model.state_dict().items(): dim = args.dim - if ".weight" in k: - if "self_attn" in k or "mlp" in k: - if "mlp" in k: - dim = int(dim * scale_factor) - p = pretrained_model.get_submodule(k.replace(".weight", "")).weight - delta = decomposition(v - p,dim).to(v.dtype) - # import pdb; pdb.set_trace() - finetuned_model.get_submodule(k.replace(".weight", "")).weight.copy_(p + delta) + if args.param_type in k : # or "mlp" in k + # if "mlp" in k: + # dim = int(dim * scale_factor) + p = pretrained_model.get_submodule(k.replace(".weight", "")).weight + f = finetuned_model.get_submodule(k.replace(".weight", "")).weight + 
delta , compressed_delta = f - p, v - p + # l2_norm.append(torch.norm(delta - compressed_delta,2).item()) + cos_sim.append(torch.mean(F.cosine_similarity(delta, compressed_delta, dim=0),dim=0).item()) + # L2_norm_total ,L1_norm_total,cos_sim, mag = L2_norm_total + torch.norm(torch.abs(delta) - torch.abs(compressed_delta),2).data, L1_norm_total + torch.norm(torch.abs(delta) - torch.abs(compressed_delta),1).data, cos_sim + F.cosine_similarity(delta, compressed_delta, dim=0), mag + torch.sum(torch.abs(compressed_delta)).data + # num += 1 -finetuned_model.save_pretrained(save_directory=args.save_dir) -finetuned_tokenizer.save_pretrained(save_directory=args.save_dir) +print("cos_sim:", cos_sim) +torch.save(cos_sim, os.path.join(args.save_dir, f"{args.param_type}_{args.model_type}_cos_sim.pt")) +# print("cos_sim_ave:", cos_sim / num) +# print("mag_ave:", mag_ave) +# finetuned_model.save_pretrained(save_directory=args.save_dir) +# finetuned_tokenizer.save_pretrained(save_directory=args.save_dir) print("--end--") diff --git a/test.py b/test.py index 500e631..55b59a9 100644 --- a/test.py +++ b/test.py @@ -134,15 +134,15 @@ def copy_nonzero_values(A, B): return A def load_svd(model): - param_dict = torch.load("/home/pingbowen/workspace/delta-compression/saved_model/llava_svd.pt") - + param_dict = torch.load(args.svd_dict) + # import pdb; pdb.set_trace() with torch.no_grad(): for k,v in param_dict.items(): if "base" in k: dim = args.dim if "mlp" in k: - dim = int(dim * 1.45) + dim = int(dim * args.scale_factor) k = k.replace(".base", "") @@ -154,15 +154,17 @@ def load_svd(model): parser = argparse.ArgumentParser(description="BitDelta") parser.add_argument("--dim", type=int, default=128) +parser.add_argument("--scale_factor", type=float, default=1.45) +parser.add_argument("--svd_dict", type=str, default="") args = parser.parse_args() -tokenizer = AutoTokenizer.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/") -model = AutoModelForCausalLM.from_pretrained("/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf/", low_cpu_mem_usage=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained("/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf") +model = AutoModelForCausalLM.from_pretrained("/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf", torch_dtype=torch.bfloat16) # low_cpu_mem_usage=True load_svd(model) -tokenizer.save_pretrained(f"/home/pingbowen/workspace/delta-compression/save/Llama-chat-svd_{args.dim}/") -model.save_pretrained(f"/home/pingbowen/workspace/delta-compression/save/Llama-chat-svd_{args.dim}/") +tokenizer.save_pretrained(f"/data/groups/QY_LLM_Other/pingbowen/models/mathlora/math_svd/") +model.save_pretrained(f"/data/groups/QY_LLM_Other/pingbowen/models/mathlora/math_svd/") # get_tokenizer("/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/") # save_full_model("/data/public/opensource_models/meta-llama/Llama-2-7b-hf/", "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0/", os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "diff_untrained.pt"), os.path.join("/home/pingbowen/workspace/delta-compression/BitDelta/save", "uncalibrated_model"), device="cuda") \ No newline at end of file
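Note for readers skimming the series: the recurring `decomposition` helper (added in bitdelta/diff.py, tailor.py and bitdelta/diff2.py) is just a truncated SVD of the weight delta, computed in float32 and re-applied on top of the base weight. A minimal standalone sketch of that step is below; the helper name, toy shapes and the error check are illustrative only, not part of the patch.

import torch

def truncated_svd_delta(base_w, finetuned_w, dim):
    # Approximate (finetuned - base) with its top-`dim` singular components,
    # as the patch's `decomposition` helper does (SVD is run in float32).
    # torch.svd mirrors the patch; torch.linalg.svd is the newer equivalent.
    delta = (finetuned_w - base_w).to(torch.float32)
    U, S, V = torch.svd(delta)
    U, S, V = U[:, :dim], S[:dim], V[:, :dim]
    return (U @ torch.diag(S) @ V.t()).to(base_w.dtype)

# Toy usage: the real scripts apply this per self_attn/mlp projection weight,
# with dim multiplied by scale_factor (e.g. 1.45) for mlp layers.
base = torch.randn(512, 512)
finetuned = base + 0.01 * torch.randn(512, 512)
compressed = base + truncated_svd_delta(base, finetuned, dim=128)
rel_err = torch.norm(finetuned - compressed) / torch.norm(finetuned - base)
print(f"relative delta error at rank 128: {rel_err.item():.3f}")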
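The "bit" branch in bitdelta/diff2.py instead rebuilds the delta from a packed sign mask and a single scale (`delta = (unpack(mask)*2-1) * coeff`). The following simplified sketch skips the repo's pack/unpack bit-packing and uses the mean absolute delta as the scale, which is an assumption here, not something this patch pins down; function and variable names are illustrative.

import torch
import torch.nn.functional as F

def sign_quantize_delta(base_w, finetuned_w):
    # 1-bit quantization of the delta: keep only the sign of each entry
    # plus one per-matrix scale (assumed: mean absolute delta).
    delta = finetuned_w - base_w
    coeff = delta.float().abs().mean()
    mask = delta >= 0          # in the repo this bool mask is bit-packed
    return mask, coeff

def dequantize_delta(mask, coeff, dtype=torch.float32):
    # Mirror of `(unpack(mask)*2-1) * coeff`: map {0,1} -> {-1,+1}, then rescale.
    return (mask.to(dtype) * 2 - 1) * coeff

base = torch.randn(512, 512)
finetuned = base + 0.01 * torch.randn(512, 512)
mask, coeff = sign_quantize_delta(base, finetuned)
approx = base + dequantize_delta(mask, coeff)
cos = F.cosine_similarity((finetuned - base).flatten(), (approx - base).flatten(), dim=0)
print(f"cosine similarity of true vs. 1-bit delta: {cos.item():.3f}")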
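Finally, the cos_sim lists that run_tailor.sh dumps into ./statistic/ and Plot.py plots come from comparing, per layer and per projection type, the true delta against the delta recovered from a compressed model. A compact sketch of that measurement, working on state dicts rather than get_submodule as the patched tailor.py does (a simplification for illustration):

import torch
import torch.nn.functional as F

def layerwise_cos_sim(pretrained_sd, finetuned_sd, compressed_sd, param_type):
    # For each layer's `param_type` weight, compare the true delta with the
    # compressed model's delta: column-wise cosine similarity, then the mean.
    sims = []
    for k, w in finetuned_sd.items():
        if param_type in k and k.endswith(".weight"):
            delta = w - pretrained_sd[k]
            compressed_delta = compressed_sd[k] - pretrained_sd[k]
            sims.append(F.cosine_similarity(delta, compressed_delta, dim=0).mean().item())
    return sims  # one value per layer, e.g. saved as {param_type}_{model_type}_cos_sim.pt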