Commit 3844797

Add a script for testing F1 Score
1 parent 53e65ba commit 3844797

File tree

5 files changed  +725 -0 lines changed

eval/README.md

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
## Accuracy Testing of Sparse Methods

### Overview
We use two Chinese subsets of [LongBench](https://huggingface.co/datasets/zai-org/LongBench), single-document QA (multifieldqa_zh) and multi-document QA (dureader), to test the accuracy of sparse methods. The F1 score is adopted as the accuracy metric. For more information about LongBench, please refer to https://github.com/THUDM/LongBench.
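For reference, the token-level F1 used for Chinese QA (`qa_f1_zh_score` in `eval/eval.py`, added in this commit) segments both the prediction and the reference with jieba and computes precision and recall over the resulting token bags. A minimal sketch of that computation, with the script's punctuation normalization omitted for brevity:

```python
# Minimal sketch of the token-level F1 used for Chinese QA.
# Mirrors qa_f1_zh_score / f1_score in eval/eval.py; the punctuation
# normalization performed by the real script is omitted here.
from collections import Counter

import jieba


def token_f1_zh(prediction: str, ground_truth: str) -> float:
    pred_tokens = [t for t in jieba.cut(prediction, cut_all=False) if t.strip()]
    gt_tokens = [t for t in jieba.cut(ground_truth, cut_all=False) if t.strip()]
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)


print(token_f1_zh("北京是中国的首都", "北京是中国的首都"))  # identical strings -> 1.0
```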
### Quick Start

#### Environment Preparation

```shell
pip install jieba fuzzywuzzy rouge
```

#### Test Data Preparation

Download the LongBench dataset:

```shell
wget https://huggingface.co/datasets/THUDM/LongBench/resolve/main/data.zip && unzip data.zip
```
#### Configure Specific Sparse Method

Settings for different sparse methods are written in a JSON file, for example:

```json
{
    "ESA": {
        "init_window_sz": 1,
        "local_window_sz": 2,
        "min_blocks": 4,
        "sparse_ratio": 0.2,
        "retrieval_stride": 10
    }
}
```
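If you want to sanity-check a configuration before launching a run, the file is plain JSON and can be inspected with standard tooling. A small illustrative snippet (the file name matches the `--config` example below; adjust the path to wherever the config lives):

```python
# Illustrative only: load the sparse-method config and inspect the ESA settings.
# The evaluation run itself receives this file via the --config argument of
# eval_inference_F1.sh; this snippet just checks that the JSON is well formed.
import json

with open("ucm_sparse_config_esa.json", "r", encoding="utf-8") as f:
    config = json.load(f)

esa = config["ESA"]
print(esa["sparse_ratio"])   # 0.2
print(sorted(esa.keys()))    # ['init_window_sz', 'local_window_sz', 'min_blocks', ...]
```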
Run accuracy testing with:

```shell
cd eval

# Run with default settings: Qwen2.5-14B-Instruct, batch=20
bash eval_inference_F1.sh

# Run with custom parameters
# --strip_think: extract the text after </think> from model predictions
# --batch: number of requests processed per batch
bash eval_inference_F1.sh \
    --model /home/models/QwQ-32B \
    --config ./eval/ucm_sparse_config_esa.json \
    --data ./eval/data \
    --strip_think 1 \
    --batch 1
```

The result files will be saved in the eval/ucm_sparse_predictions folder.
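Each prediction file is JSON Lines, with one record per request containing at least a `pred` string and an `answers` list (plus an optional `length` field), which is the format `eval.py` reads. To re-score such a file without the shell wrapper, something along these lines works (the JSONL path is only an example; run it from inside `eval/` so that `eval.py` is importable):

```python
# Re-score an existing prediction file with the helpers from eval/eval.py.
import json

from eval import scorer  # eval/eval.py in this commit

predictions, answers = [], []
with open("ucm_sparse_predictions/multifieldqa_zh.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        predictions.append(record["pred"])
        answers.append(record["answers"])

# scorer() averages the per-sample metric (token-level F1 for multifieldqa_zh)
# and reports it on a 0-100 scale, as in eval.py's __main__ block.
print(scorer("multifieldqa_zh", predictions, answers, all_classes=None))
```

Equivalently, `python eval.py --answer <prediction_file> --dataset multifieldqa_zh` runs the same scoring, with `--strip_think` available to keep only the text after `</think>`.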
### Results

Test results of Full Attention (Qwen2.5-14B-Instruct):

| Dataset         | F1-Score |
|-----------------|---------:|
| multifieldqa_zh |     66.6 |
| dureader        |    29.33 |

eval/eval.py

Lines changed: 282 additions & 0 deletions
@@ -0,0 +1,282 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import json
import argparse
import numpy as np
import re
import string

import jieba
from fuzzywuzzy import fuzz
from collections import Counter
from rouge import Rouge


def extract_pred_after_think(text):
    if text is None:
        return ""
    t = text.strip()
    idx = t.find("</think>")
    if idx != -1:
        return t[idx + len("</think>"):].strip()
    return t.strip()

def has_think_tag(text):
    if text is None:
        return False
    return ("</think>" in text)


def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)
    def white_space_fix(text):
        return " ".join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def normalize_zh_answer(s):
    def white_space_fix(text):
        return "".join(text.split())
    def remove_punc(text):
        cn_punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
        all_punctuation = set(string.punctuation + cn_punctuation)
        return "".join(ch for ch in text if ch not in all_punctuation)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_punc(lower(s)))

def count_score(prediction, ground_truth, **kwargs):
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)

def retrieval_score(prediction, ground_truth, **kwargs):
    pattern = r'Paragraph (\d+)'
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)

def retrieval_zh_score(prediction, ground_truth, **kwargs):
    pattern = r'段落(\d+)'
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)

def code_sim_score(prediction, ground_truth, **kwargs):
    all_lines = prediction.lstrip('\n').split('\n')
    prediction = ""
    for line in all_lines:
        if ('`' not in line) and ('#' not in line) and ('//' not in line):
            prediction = line
            break
    return (fuzz.ratio(prediction, ground_truth) / 100)

def classification_score(prediction, ground_truth, **kwargs):
    em_match_list = []
    all_classes = kwargs["all_classes"]
    for class_name in all_classes:
        if class_name in prediction:
            em_match_list.append(class_name)
    for match_term in em_match_list:
        if match_term in ground_truth and match_term != ground_truth:
            em_match_list.remove(match_term)
    if ground_truth in em_match_list:
        score = (1.0 / len(em_match_list))
    else:
        score = 0.0
    return score

def rouge_score(prediction, ground_truth, **kwargs):
    rouge = Rouge()
    try:
        scores = rouge.get_scores([prediction], [ground_truth], avg=True)
    except:
        return 0.0
    return scores["rouge-l"]["f"]

def rouge_zh_score(prediction, ground_truth, **kwargs):
    prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
    ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
    score = rouge_score(prediction, ground_truth)
    return score

def f1_score(prediction, ground_truth, **kwargs):
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction)
    recall = 1.0 * num_same / len(ground_truth)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def qa_f1_score(prediction, ground_truth, **kwargs):
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)
    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    return f1_score(prediction_tokens, ground_truth_tokens)

def qa_f1_zh_score(prediction, ground_truth, **kwargs):
    prediction_tokens = list(jieba.cut(prediction, cut_all=False))
    ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
    prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
    ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
    prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
    ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
    return f1_score(prediction_tokens, ground_truth_tokens)

dataset2metric = {
    "narrativeqa": qa_f1_score,
    "qasper": qa_f1_score,
    "multifieldqa_en": qa_f1_score,
    "multifieldqa_zh": qa_f1_zh_score,
    "clongeval": qa_f1_zh_score,
    "hotpotqa": qa_f1_score,
    "2wikimqa": qa_f1_score,
    "musique": qa_f1_score,
    "dureader": rouge_zh_score,
    "gov_report": rouge_score,
    "qmsum": rouge_score,
    "multi_news": rouge_score,
    "vcsum": rouge_zh_score,
    "trec": classification_score,
    "triviaqa": qa_f1_score,
    "samsum": rouge_score,
    "lsht": classification_score,
    "passage_retrieval_en": retrieval_score,
    "passage_count": count_score,
    "passage_retrieval_zh": retrieval_zh_score,
    "lcc": code_sim_score,
    "repobench-p": code_sim_score,
}

def parse_args(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default=None)
    parser.add_argument('--answer', type=str, default=None)
    parser.add_argument('--dataset', type=str, default=None)
    parser.add_argument('--strip_think', action='store_true', help="Extract the content after </think>")
    parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")
    return parser.parse_args(args)

def scorer_e(dataset, predictions, answers, lengths, all_classes):
    scores = {"0-4k": [], "4-8k": [], "8k+": []}
    for (prediction, ground_truths, length) in zip(predictions, answers, lengths):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        if length < 4000:
            scores["0-4k"].append(score)
        elif length < 8000:
            scores["4-8k"].append(score)
        else:
            scores["8k+"].append(score)
    for key in scores.keys():
        scores[key] = round(100 * np.mean(scores[key]), 2)
    return scores

def scorer(dataset, predictions, answers, all_classes):
    total_score = 0.
    # count = 0
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))

        total_score += score
    return round(100 * total_score / len(predictions), 2)

def fix_json_format(line):
    line = re.sub(r'"answers": \[\[(.*?)\]\]', r'"answers": [\1]', line)

    line = line.replace("'", '"')
    line = line.replace("None", "null")
    line = line.strip().replace("\n", "").replace("\r", "").replace("\t", "")
    pattern = re.compile(r'"pred":"(.*?)"(?=,)', re.DOTALL)
    def escape_quotes(match):
        escaped_value = match.group(1).replace('"', '\\"')
        return f'"pred":"{escaped_value}"'

    line = pattern.sub(escape_quotes, line)

    pattern = re.compile(r'"answers":\s*\[([^\]]+)\]', re.DOTALL)
    def escape_quotes_in_answers(match):
        internal_content = match.group(1)

        items = internal_content.split('","')
        # import pdb; pdb.set_trace()
        escaped_items = [item.replace('"', '\\"') for item in items]

        escaped_content = '","'.join(escaped_items)

        return f'"answers": ["{escaped_content}"]'
    line = pattern.sub(escape_quotes_in_answers, line)

    return line

if __name__ == '__main__':
    args = parse_args()

    predictions, answers, lengths = [], [], []
    all_classes = None
    with open(args.answer, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            pred_raw = data["pred"]
            if args.strip_think:
                if not has_think_tag(pred_raw):
                    continue
                pred_clean = extract_pred_after_think(pred_raw)
                print(pred_clean)
            else:
                pred_clean = pred_raw

            predictions.append(pred_clean)
            answers.append(data["answers"])

            if "length" in data:
                lengths.append(data["length"])

    print("----"*10)
    print("Valid samples:", len(predictions))
    print("----"*10)

    if args.e:
        score = scorer_e(args.dataset, predictions, answers, lengths, all_classes)
        print("All score:", score)
    else:
        score50 = scorer(args.dataset, predictions[:50], answers[:50], all_classes)
        score_all = scorer(args.dataset, predictions, answers, all_classes)
        print("50 score:", score50)
        print("All score:", score_all)
