import mojimoji
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification

import iob_util  # IOB tagging helpers; provides convert_iob_to_xml used below
model_name = "gabrielandrade2/point-to-span-estimation"

# Load the point-to-span estimation model and its tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
text = "肥大型心⧫筋症、心房⧫細動に対してWF投与が開始となった。\ |
|
治療経過中に非持続性心⧫室頻拍が認められたためアミオダロンが併用となった。" |
|
|
|
|
|
# Normalize half-width characters to full-width
text = mojimoji.han_to_zen(text)

# Tokenize (token strings are not used below; kept for inspection) and encode
tokenized = tokenizer.tokenize(text)
input_ids = tokenizer.encode(text, return_tensors="pt")
# Run the model and pick the highest-scoring label id for each token
output = model(input_ids)
logits = output[0].detach().cpu().numpy()
tags = np.argmax(logits, axis=2)[0].tolist()
# Map label ids to their IOB tag strings
id2label = model.config.id2label
tags = [id2label[t] for t in tags]
# Recover the token strings and drop special tokens (and their tags)
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
tags = [y for x, y in zip(tokens, tags) if x not in ['[CLS]', '[SEP]', '[PAD]']]
tokens = [x for x in tokens if x not in ['[CLS]', '[SEP]', '[PAD]']]
# Render the IOB-tagged tokens as XML-style span tags and strip the ⧫ markers
xml_text = iob_util.convert_iob_to_xml(tokens, tags)
xml_text = xml_text.replace('⧫', '')
print(xml_text)
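
# Optional: the same steps wrapped into one reusable function. A minimal
# sketch, not part of the original example: the name `point_to_span` and the
# torch.no_grad() guard are additions; it reuses the tokenizer, model, and
# iob_util imported above.
import torch

def point_to_span(text: str) -> str:
    """Expand ⧫ point annotations in `text` into XML span annotations."""
    text = mojimoji.han_to_zen(text)
    input_ids = tokenizer.encode(text, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(input_ids)[0].cpu().numpy()
    tags = [model.config.id2label[t] for t in np.argmax(logits, axis=2)[0].tolist()]
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    kept = [(x, y) for x, y in zip(tokens, tags) if x not in ['[CLS]', '[SEP]', '[PAD]']]
    tokens, tags = [x for x, _ in kept], [y for _, y in kept]
    return iob_util.convert_iob_to_xml(tokens, tags).replace('⧫', '')

# Usage, e.g.: print(point_to_span("肥大型心⧫筋症に対してWF投与が開始となった。"))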