QN_AI_Hackathon_2022/utils.py at main · CTA404NotFound/QN_AI_Hackathon_2022 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import torch
import numpy as np


def sigmoid(pred):
    """Numerically stable element-wise logistic sigmoid.

    Mathematically this is ``1 / (1 + exp(-pred))``. It is evaluated as
    ``exp(-log(1 + exp(-pred)))`` through ``np.logaddexp`` so that inputs
    with a large negative value do not overflow inside ``exp`` (the naive
    form emits a RuntimeWarning in that case).

    Args:
        pred: Scalar or NumPy array of raw scores (logits).

    Returns:
        The sigmoid of ``pred``, element-wise, in the range [0, 1].
    """
    # logaddexp(0, -pred) == log(exp(0) + exp(-pred)) == log(1 + exp(-pred)),
    # computed without ever materialising a huge exp(-pred).
    return np.exp(-np.logaddexp(0, -pred))


def softmax(pred):
    """Softmax over the last axis of ``pred``.

    The per-row maximum is subtracted before exponentiating so the
    exponentials cannot overflow; this does not change the result.

    Args:
        pred: Array of raw scores; normalisation runs along ``axis=-1``.

    Returns:
        Array of the same shape whose entries along the last axis sum to 1.
    """
    row_max = np.max(pred, axis=-1, keepdims=True)
    exps = np.exp(pred - row_max)
    normaliser = np.sum(exps, axis=-1, keepdims=True)
    return exps / normaliser


def word_segmentation(text, segmenter):
    """Segment ``text`` into words and return them as one string.

    ``segmenter.tokenize(text)`` is expected to return an iterable of
    token lists (the original docstring names the VnCoreNLP toolkit).
    All tokens are flattened in order and joined with single spaces.

    Args:
        text: Raw text to segment.
        segmenter: Object exposing ``tokenize(text)``.

    Returns:
        str: The space-joined tokens.
    """
    flat_tokens = []
    for sentence_tokens in segmenter.tokenize(text):
        flat_tokens.extend(sentence_tokens)
    return " ".join(flat_tokens)


def convert_tokens_to_features(texts, tokenizer, segmenter, max_seq_length=256, labels=None):
    """Encode reviews as fixed-length input-id and attention-mask tensors.

    Each review is first word-segmented with ``word_segmentation`` and then
    encoded with ``tokenizer.encode_plus``, padded/truncated to
    ``max_seq_length``.

    Args:
        texts (list): list of reviews.
        tokenizer: tokenizer of the pre-trained model; must expose
            ``encode_plus``.
        segmenter: word segmentation toolkit passed to ``word_segmentation``.
        max_seq_length (int, optional): Length every sequence is padded or
            truncated to. Defaults to 256.
        labels (optional): Targets for the reviews. When given, they are
            returned as an additional tensor. Defaults to None.

    Returns:
        tuple: ``(input_ids, attention_masks)``, or
        ``(input_ids, attention_masks, labels)`` when ``labels`` is given.
        All tensors are created with ``dtype=torch.float32``.

    NOTE(review): token ids are emitted as float32; embedding layers
    usually expect integer ids - confirm that callers cast before use.
    """
    encoded_reviews = [
        tokenizer.encode_plus(
            word_segmentation(review, segmenter),
            padding="max_length",
            max_length=max_seq_length,
            truncation=True,
        )
        for review in texts
    ]

    ids_tensor = torch.tensor(
        [enc["input_ids"] for enc in encoded_reviews], dtype=torch.float32
    )
    mask_tensor = torch.tensor(
        [enc["attention_mask"] for enc in encoded_reviews], dtype=torch.float32
    )

    # Without labels (inference) only the two feature tensors are returned.
    if labels is None:
        return ids_tensor, mask_tensor

    return ids_tensor, mask_tensor, torch.tensor(labels, dtype=torch.float32)

def convert_output_format(output_one_hot_tensor):
    """Turn the model's one-hot output into a list of polarity scores.

    The tensor is flattened and read in consecutive groups of five values.
    A group with no non-zero entry contributes ``0``; otherwise every
    position in the group equal to 1 contributes its 1-based index (1..5).

    Args:
        output_one_hot_tensor (Tensor): The model's output vector.

    Returns:
        list: Sentiment polarity scores following the organizer's format.

    NOTE(review): a group holding more than one 1 yields more than one
    score, so the input is presumably strictly one-hot per group - confirm.
    """
    flat = torch.reshape(output_one_hot_tensor, shape=(-1,))
    scores = []
    for start in range(0, len(flat), 5):
        group = flat[start: start + 5]
        if not any(group):
            scores.append(0)
        scores.extend(
            offset + 1 for offset in range(5) if flat[start + offset] == 1
        )
    return scores