Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement bert-base-japanese-sentiment #975

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ The collection of pre-trained, state-of-the-art AI models.
|[bert_tweets_sentiment](/natural_language_processing/bert_tweets_sentiment) | [huggingface/transformers](https://github.com/huggingface/transformers) | Pytorch | 1.2.5 and later |
|[gpt2](/natural_language_processing/gpt2) | [GPT-2](https://github.com/onnx/models/blob/master/text/machine_comprehension/gpt-2/README.md) | Pytorch | 1.2.7 and later |
|[rinna_gpt2](/natural_language_processing/rinna_gpt2) | [japanese-pretrained-models](https://github.com/rinnakk/japanese-pretrained-models) | Pytorch | 1.2.7 and later |
|[bert_base_japanese_sentiment](/natural_language_processing/bert_base_japanese_sentiment) | [bert-base-japanese-sentiment](https://huggingface.co/daigo/bert-base-japanese-sentiment) | Pytorch | 1.2.7 and later |

## Neural Rendering

Expand Down
47 changes: 47 additions & 0 deletions natural_language_processing/bert_base_japanese_sentiment/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Hugging Face - daigo/bert-base-japanese-sentiment

## Input

A Japanese sentence to be classified.

- Sample
```
私は幸せである。
```

## Output

Recognized sentiment with its confidence score
```
ポジティブ : 0.9903476238250732
```

## Usage
Automatically downloads the onnx and prototxt files on the first run.
It is necessary to be connected to the Internet while downloading.

For the sample sentence,
```bash
$ python3 bert_base_japanese_sentiment.py
```

If you want to specify the input sentence, put it after the `--input` option.
```bash
$ python3 bert_base_japanese_sentiment.py --input SENTENCE
```

## Reference

[Hugging Face - daigo/bert-base-japanese-sentiment](https://huggingface.co/daigo/bert-base-japanese-sentiment)

## Framework

Pytorch

## Model Format

ONNX opset=11

## Netron

[model.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/bert_base_japanese_sentiment/model.onnx.prototxt)
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import sys
import time

import numpy as np
import cv2
from PIL import Image

from transformers import AutoTokenizer

import ailia

# import original modules
sys.path.append('../../util')
from utils import get_base_parser, update_parser, get_savepath # noqa
from model_utils import check_and_download_models # noqa
from math_utils import softmax
# logger
from logging import getLogger # noqa

logger = getLogger(__name__)

# ======================
# Parameters
# ======================

# ONNX weight / prototxt network definition, downloaded from REMOTE_PATH
# on first run (see check_and_download_models in main()).
WEIGHT_PATH = "model.onnx"
MODEL_PATH = "model.onnx.prototxt"
REMOTE_PATH = 'https://storage.googleapis.com/ailia-models/bert_base_japanese_sentiment/'

# Sentence used when --input is not given ("I am happy.").
DEFAULT_TEXT = "私は幸せである。"

# ======================
# Argument Parser Config
# ======================

parser = get_base_parser(
    'bert-base-japanese-sentiment', None, None
)
parser.add_argument(
    '--input', '-i', default=DEFAULT_TEXT
)
parser.add_argument(
    '--onnx',
    action='store_true',
    help='execute onnxruntime version.'
)
# check_input_type=False: the input is a raw text string, not an image/video path.
args = update_parser(parser, check_input_type=False)


# ======================
# Main functions
# ======================

def preprocess(tokenizer, sequence):
    """Tokenize *sequence* into a (1, seq_len) int64 id array for the model."""
    token_ids = np.asarray(tokenizer.encode(sequence), dtype=np.int64)
    # Add the batch dimension expected by the network.
    return token_ids[np.newaxis, :]


def post_processing(classifier, output):
    """Apply the final linear classification head to a pooled BERT output.

    classifier is a dict holding the head's "weight" matrix and "bias"
    vector (loaded from classifier.npy); returns softmax class scores.
    """
    logits = np.dot(classifier["weight"], output) + classifier["bias"]
    return softmax(logits)


def predict(model_info, sequence):
    """Run sentiment classification on *sequence*.

    model_info holds the tokenizer, the network (ailia or onnxruntime),
    and the classification-head weights. Returns softmax scores.
    """
    tokenizer = model_info["tokenizer"]
    net = model_info["model"]
    classifier = model_info["classifier"]

    input_ids = preprocess(tokenizer, sequence)
    # Every token is real (no padding) and belongs to a single segment.
    attention_mask = np.ones_like(input_ids)
    token_type_ids = np.zeros_like(input_ids)

    # feedforward
    if args.onnx:
        feed = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
        }
        output = net.run(None, feed)
    else:
        output = net.predict([input_ids, attention_mask, token_type_ids])

    # The model outputs (last_hidden_state, pooled_output); only the
    # pooled [CLS] representation feeds the classification head.
    _, pooled_output = output
    return post_processing(classifier, pooled_output[0])


def recognize_from_text(model_info):
    """Classify the sentiment of the --input sentence and log the result."""
    text = args.input

    logger.info("Input : " + text)

    # inference
    logger.info('Start inference...')
    if args.benchmark:
        logger.info('BENCHMARK mode')
        total = 0
        for run in range(args.benchmark_count):
            t0 = int(round(time.time() * 1000))
            scores = predict(model_info, text)
            t1 = int(round(time.time() * 1000))
            elapsed = t1 - t0

            logger.info(f'\tailia processing estimation time {elapsed} ms')
            # The first run warms caches, so it is excluded from the average.
            if run != 0:
                total += elapsed

        logger.info(f'\taverage time estimation {total / (args.benchmark_count - 1)} ms')
    else:
        scores = predict(model_info, text)

    # Label order follows daigo/bert-base-japanese-sentiment:
    # index 0 = positive, index 1 = negative.
    id2label = {
        0: 'ポジティブ',
        1: 'ネガティブ'
    }
    top = np.argmax(scores)
    logger.info(f'{id2label[top]} : {scores[top]}')

    logger.info('Script finished successfully.')


def main():
    """Download model files if needed, build the pipeline, and run inference."""
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    # Tokenizer files are expected in the local "tokenizer" directory.
    tokenizer = AutoTokenizer.from_pretrained("tokenizer")

    if args.onnx:
        import onnxruntime
        net = onnxruntime.InferenceSession(WEIGHT_PATH)
    else:
        net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)

    # Classification-head weights exported separately from the BERT body.
    # NOTE(review): np.load with allow_pickle=True runs pickle on the file —
    # only safe because classifier.npy ships with the model, not user input.
    classifier = np.load("classifier.npy", allow_pickle=True).item()

    recognize_from_text({
        "tokenizer": tokenizer,
        "model": net,
        "classifier": classifier,
    })


if __name__ == '__main__':
    main()
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"cls_token": "[CLS]",
"mask_token": "[MASK]",
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"unk_token": "[UNK]"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"cls_token": "[CLS]",
"do_lower_case": false,
"do_subword_tokenize": true,
"do_word_tokenize": true,
"jumanpp_kwargs": null,
"mask_token": "[MASK]",
"max_len": 512,
"mecab_kwargs": null,
"name_or_path": "daigo/bert-base-japanese-sentiment",
"never_split": null,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"special_tokens_map_file": "/home/ooe/.cache/huggingface/hub/models--daigo--bert-base-japanese-sentiment/snapshots/51ac2d2c0a5645d77ca26078fc5f02c349fbb93d/special_tokens_map.json",
"subword_tokenizer_type": "wordpiece",
"sudachi_kwargs": null,
"tokenizer_class": "BertJapaneseTokenizer",
"unk_token": "[UNK]",
"word_tokenizer_type": "mecab"
}
Loading