studies/code/my/nlp/ner_utils.py at master · yanwenheng/studies · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Time: 2021-09-18 3:31 PM

Author: huayang

Subject: Utilities for parsing NER (named entity recognition) tagging results.

"""
import doctest

from typing import Dict

import numpy as np

# Public API of this module (the names exported by ``from ... import *``).
__all__ = ['ner_result_parse']


def ner_result_parse(tokens, labels,
                     label_id2name: Dict[int, str],
                     token_id2name: Dict[int, str] = None):
    """@NLP Utils
    Parse NER tagging results (BIO scheme) into a list of entity chunks.

    An entity starts at a ``B``/``B-XX`` label and is extended by the
    directly following ``I``/``I-XX`` labels that carry the same tag.
    Any other label (``O``, an ``I-`` label with a different tag, ...) closes
    the open entity. An ``I-`` label that does not continue an open entity is
    ignored.

    Examples:
        >>> _label_id2name = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-LOC', 4: 'I-LOC'}
        >>> _tokens = list('你知道小明生活在北京吗？')
        >>> _labels = list(map(int, '000120003400'))
        >>> ner_result_parse(_tokens, _labels, _label_id2name)
        [['PER', '小明', (3, 4)], ['LOC', '北京', (8, 9)]]

        >>> _tokens = list('小明生活在北京')  # entities at the head and the tail
        >>> _labels = list(map(int, '1200034'))
        >>> ner_result_parse(_tokens, _labels, label_id2name=_label_id2name)
        [['PER', '小明', (0, 1)], ['LOC', '北京', (5, 6)]]

        >>> _tokens = list('明生活在北京')  # 明: I-PER without a preceding B-PER
        >>> _labels = list(map(int, '200034'))
        >>> ner_result_parse(_tokens, _labels, label_id2name=_label_id2name)
        [['LOC', '北京', (4, 5)]]

        >>> _tokens = list('小明生活在北')
        >>> _labels = list(map(int, '120003'))  # 北: B-LOC
        >>> ner_result_parse(_tokens, _labels, label_id2name=_label_id2name)
        [['PER', '小明', (0, 1)], ['LOC', '北', (5, 5)]]

        >>> _tokens = list('小明生活在北京')  # 明: I-LOC does not continue B-PER
        >>> _labels = list(map(int, '1400034'))
        >>> ner_result_parse(_tokens, _labels, label_id2name=_label_id2name)
        [['PER', '小', (0, 0)], ['LOC', '北京', (5, 6)]]

    Args:
        tokens: sequence of tokens (or token ids when ``token_id2name`` is given).
        labels: sequence of labels (or label ids when ``label_id2name`` is
            given); must have the same length as ``tokens``.
        label_id2name: mapping from label id to label name; ids missing from
            the mapping are kept unchanged. ``None`` disables the mapping.
        token_id2name: optional mapping from token id to token text; ids
            missing from the mapping are kept unchanged.

    Returns:
        A list of ``[tag, span_text, (begin, end)]`` items in order of
        appearance, where ``end`` is inclusive, for example:
        [['PER', '小明', (3, 4)], ['LOC', '北京', (8, 9)]]

    Raises:
        AssertionError: if ``tokens`` and ``labels`` differ in length.
    """
    no_idx = -1  # sentinel: no entity is currently open

    if token_id2name is not None:
        tokens = [token_id2name.get(t, t) for t in tokens]
    if label_id2name is not None:
        labels = [label_id2name.get(t, t) for t in labels]

    # Raised explicitly (instead of a bare ``assert``) so the check survives
    # ``python -O`` while callers still see the same exception type.
    if len(tokens) != len(labels):
        raise AssertionError(
            'tokens and labels must have the same length, got %d and %d'
            % (len(tokens), len(labels)))

    def _tag_of(label_name):
        # 'B-PER' -> 'PER'; a plain 'B' / 'I' has no tag part -> '_SPAN'.
        parts = label_name.split('-')
        return parts[1] if len(parts) > 1 else '_SPAN'

    chunks = []

    def _close(chunk_tag, first, last):
        # Record the entity covering tokens[first..last] (both inclusive).
        chunks.append([chunk_tag, ''.join(tokens[first: last + 1]), (first, last)])

    tag, beg, end = '', no_idx, no_idx
    for idx, label in enumerate(labels):
        if label.startswith('B'):
            # A new entity begins; close the one still open, if any.
            if beg != no_idx:
                _close(tag, beg, end)
            tag = _tag_of(label)
            beg = end = idx
        elif label.startswith('I') and beg != no_idx and _tag_of(label) == tag:
            # Continuation of the open entity.
            end = idx
        else:
            # 'O', an orphan 'I-XX', or an 'I-XX' whose tag differs from the
            # open entity: all of them terminate the open entity.
            if beg != no_idx:
                _close(tag, beg, end)
            tag, beg, end = '', no_idx, no_idx

    # Flush an entity that runs up to the end of the sequence.
    if beg != no_idx:
        _close(tag, beg, end)

    return chunks


def _test():
    """Run the doctests of the ``__main__`` module (this file when run as a script)."""
    # ``m=None`` is the documented default: doctest examines ``__main__``.
    doctest.testmod(m=None)


if __name__ == '__main__':
    # Executed as a script: run the doctest suite.
    _test()