# reference_processor/confidence_manager.py (katrina-666/reference_processor, branch: main)
"""
置信度管理模块
负责评估 BibTeX 记录的完整性并计算置信度。
"""

import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import convert_to_unicode


def calculate_confidence(bibtex_entry):
    """
    Score how complete a BibTeX record is and describe any missing fields.

    Every key field carries a weight. The score starts at 1.0 and loses the
    weight of each field that is absent or blank.

    Args:
        bibtex_entry (dict): Mapping of field name to value, as produced by
            bibtexparser. ``None`` counts as missing; any other non-string
            value is stringified before the blank check.

    Returns:
        tuple: ``(score, warning)``. ``score`` is a float in [0.0, 1.0].
        ``warning`` is "完整" when nothing is missing, otherwise a message
        listing the missing fields by their Chinese display names.
    """
    # Weights are integer percentage points (they sum to 100). Subtracting
    # float weights from 1.0 drifts: a record missing only DOI and volume came
    # out as 0.7999999999999999 and slipped under the 0.8 threshold.
    field_weights = {
        'title': 20,      # title - important
        'author': 20,     # author - important
        'journal': 15,    # journal / book title - important
        'year': 15,       # year - important
        'doi': 15,        # DOI - high weight (most reliable identifier)
        'pages': 10,      # pages - medium
        'volume': 5,      # volume - low
    }

    # Chinese display names used in the warning text.
    field_names_cn = {
        'title': '标题',
        'author': '作者',
        'journal': '期刊',
        'year': '年份',
        'doi': 'DOI',
        'pages': '页码',
        'volume': '卷号',
    }

    missing_fields = []
    for field in field_weights:
        value = bibtex_entry.get(field)
        # Stringify so that e.g. an integer year does not crash on .strip().
        if value is None or not str(value).strip():
            missing_fields.append(field)

    missing_points = sum(field_weights[field] for field in missing_fields)
    confidence_score = (100 - missing_points) / 100

    # Keep the score inside [0.0, 1.0] even if the weights are edited later.
    confidence_score = max(0.0, min(1.0, confidence_score))

    if not missing_fields:
        return confidence_score, "完整"

    missing_str = '、'.join(field_names_cn.get(f, f) for f in missing_fields)

    # Only the high-confidence case uses the colon form; every lower score
    # shares one message format.
    if confidence_score >= 0.8:
        warning = f"缺少：{missing_str}"
    else:
        warning = f"缺少{missing_str}"

    return confidence_score, warning


def evaluate_bibtex_string(bibtex_string):
    """
    Parse a BibTeX string and score its first entry.

    Args:
        bibtex_string (str): Text in BibTeX format.

    Returns:
        tuple: ``(score, warning, entry)`` for the first parsed entry.
        On failure the score is 0.0 and the entry is ``None``; the warning
        then says whether the input was empty, contained no entries, or
        raised an error while being processed.
    """
    # Guard clause: nothing to parse.
    if not bibtex_string:
        return 0.0, "BibTeX 数据为空", None

    try:
        # Parser configured with the convert_to_unicode customization.
        unicode_parser = BibTexParser()
        unicode_parser.customization = convert_to_unicode
        parsed_entries = bibtexparser.loads(
            bibtex_string, parser=unicode_parser
        ).entries

        if parsed_entries:
            # Only the first entry is evaluated; later entries are ignored.
            first_entry = parsed_entries[0]
            score, note = calculate_confidence(first_entry)
            return score, note, first_entry

        return 0.0, "BibTeX 解析失败：无条目", None

    except Exception as exc:
        # Any failure while parsing or scoring is reported via the warning
        # text instead of propagating to the caller.
        return 0.0, f"BibTeX 解析错误：{exc!s}", None


def get_confidence_level(score):
    """
    Map a confidence score to its level label.

    Args:
        score (float): Confidence score, expected in the range 0.0-1.0.

    Returns:
        str: "高" for scores of at least 0.9, "中" for at least 0.7,
        "低" for at least 0.5, and "极低" otherwise.
    """
    # Return the label alone: the previous `score + label` added a float to a
    # str and raised TypeError on every call.
    if score >= 0.9:
        return "高"
    if score >= 0.7:
        return "中"
    if score >= 0.5:
        return "低"
    return "极低"


if __name__ == '__main__':
    # Manual smoke test: score one sample record that has every key field
    # populated, then print the score, its level and the warning text.
    sample_bibtex = """
    @article{test2023,
      author = {Smith, John and Doe, Jane},
      title = {Test Article},
      journal = {Test Journal},
      year = {2023},
      volume = {10},
      pages = {123-145},
      doi = {10.1000/test}
    }
    """

    score, note, _parsed_entry = evaluate_bibtex_string(sample_bibtex)
    level = get_confidence_level(score)
    print(f"置信度: {score:.2f} ({level})")
    print(f"警告: {note}")