socialcomp committed
Commit: cd5ed10
Parent(s): e82979a

Fork of MedNER-CR-JA

Browse files
- .gitattributes +1 -0
- NER_medNLP.py +238 -0
- README.md +47 -1
- config.json +189 -0
- id_to_tags.pkl +3 -0
- key_attr.pkl +3 -0
- model.safetensors +3 -0
- predict.py +143 -0
- pytorch_model.bin +3 -0
- text.txt +15 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
NER_medNLP.py
ADDED
@@ -0,0 +1,238 @@
# %%

import itertools
from tqdm import tqdm
import numpy as np
import torch
from transformers import BertJapaneseTokenizer, BertForTokenClassification
import pytorch_lightning as pl

# from torch.utils.data import DataLoader
# import from_XML_to_json as XtC
# import random
# import json
# import unicodedata
# import pandas as pd

# %%
# 8-16
# PyTorch Lightning model
class BertForTokenClassification_pl(pl.LightningModule):

    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        self.save_hyperparameters()
        self.bert_tc = BertForTokenClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        output = self.bert_tc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.bert_tc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)


# %%
class NER_tokenizer_BIO(BertJapaneseTokenizer):

    # Accept the number of entity categories `num_entity_type`
    # at initialization time.
    def __init__(self, *args, **kwargs):
        self.num_entity_type = kwargs.pop('num_entity_type')
        super().__init__(*args, **kwargs)

    def encode_plus_tagged(self, text, entities, max_length):
        """
        Given a sentence and the entities it contains,
        encode the text and build the label sequence.
        """
        # Split the text before and after each entity and attach a label to each piece.
        splitted = []  # fragments of the split text
        position = 0

        for entity in entities:
            start = entity['span'][0]
            end = entity['span'][1]
            label = entity['type_id']
            splitted.append({'text': text[position:start], 'label': 0})
            splitted.append({'text': text[start:end], 'label': label})
            position = end
        splitted.append({'text': text[position:], 'label': 0})
        splitted = [s for s in splitted if s['text']]

        # Tokenize each fragment and assign labels to its tokens.
        tokens = []  # collected tokens
        labels = []  # collected labels
        for s in splitted:
            tokens_splitted = self.tokenize(s['text'])
            label = s['label']
            if label > 0:  # entity
                # First give every token the I- tag.
                # Label ids: O = 0, B = 1..num_entity_type, I = num_entity_type+1..2*num_entity_type
                labels_splitted = \
                    [label + self.num_entity_type] * len(tokens_splitted)
                # Then mark the first token with the B- tag.
                labels_splitted[0] = label
            else:  # non-entity
                labels_splitted = [0] * len(tokens_splitted)

            tokens.extend(tokens_splitted)
            labels.extend(labels_splitted)

        # Encode into a form BERT can take as input.
        input_ids = self.convert_tokens_to_ids(tokens)
        encoding = self.prepare_for_model(
            input_ids,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )

        # Add labels for the special tokens:
        # truncate to max_length - 2 and add labels for [CLS] and [SEP].
        labels = [0] + labels[:max_length-2] + [0]
        # Pad with label 0 at the end if shorter than max_length.
        labels = labels + [0]*( max_length - len(labels) )
        encoding['labels'] = labels

        return encoding

    def encode_plus_untagged(
        self, text, max_length=None, return_tensors=None
    ):
        """
        Tokenize the text and record each token's position in the original string.
        Same as encode_plus_untagged of the IO-scheme tokenizer.
        """
        # Tokenize the text and map each token to its substring in the original text.
        tokens = []  # collected tokens
        tokens_original = []  # original substrings corresponding to each token
        words = self.word_tokenizer.tokenize(text)  # split into words with MeCab
        for word in words:
            # Split the word into subwords.
            tokens_word = self.subword_tokenizer.tokenize(word)
            tokens.extend(tokens_word)
            if tokens_word[0] == '[UNK]':  # handle unknown words
                tokens_original.append(word)
            else:
                tokens_original.extend([
                    token.replace('##','') for token in tokens_word
                ])

        # Find each token's position in the text (taking whitespace into account).
        position = 0
        spans = []  # collected token spans
        for token in tokens_original:
            l = len(token)
            while 1:
                if token != text[position:position+l]:
                    position += 1
                else:
                    spans.append([position, position+l])
                    position += l
                    break

        # Encode into a form BERT can take as input.
        input_ids = self.convert_tokens_to_ids(tokens)
        encoding = self.prepare_for_model(
            input_ids,
            max_length=max_length,
            padding='max_length' if max_length else False,
            truncation=True if max_length else False
        )
        sequence_length = len(encoding['input_ids'])
        # Add a dummy span for the special token [CLS].
        spans = [[-1, -1]] + spans[:sequence_length-2]
        # Add dummy spans for the special tokens [SEP] and [PAD].
        spans = spans + [[-1, -1]] * ( sequence_length - len(spans) )

        # Convert to torch.Tensor if requested.
        if return_tensors == 'pt':
            encoding = { k: torch.tensor([v]) for k, v in encoding.items() }

        return encoding, spans

    @staticmethod
    def Viterbi(scores_bert, num_entity_type, penalty=10000):
        """
        Find the optimal label sequence with the Viterbi algorithm.
        """
        m = 2*num_entity_type + 1
        penalty_matrix = np.zeros([m, m])
        for i in range(m):
            for j in range(1+num_entity_type, m):
                if not ( (i == j) or (i+num_entity_type == j) ):
                    penalty_matrix[i,j] = penalty
        path = [ [i] for i in range(m) ]
        scores_path = scores_bert[0] - penalty_matrix[0,:]
        scores_bert = scores_bert[1:]

        for scores in scores_bert:
            assert len(scores) == 2*num_entity_type + 1
            score_matrix = np.array(scores_path).reshape(-1,1) \
                + np.array(scores).reshape(1,-1) \
                - penalty_matrix
            scores_path = score_matrix.max(axis=0)
            argmax = score_matrix.argmax(axis=0)
            path_new = []
            for i, idx in enumerate(argmax):
                path_new.append( path[idx] + [i] )
            path = path_new

        labels_optimal = path[np.argmax(scores_path)]
        return labels_optimal

    def convert_bert_output_to_entities(self, text, scores, spans):
        """
        Extract entities from the text, the classification scores,
        and the token spans.
        The scores are a 2-D array of shape (sequence length, number of labels).
        """
        assert len(spans) == len(scores)
        num_entity_type = self.num_entity_type

        # Remove the parts corresponding to special tokens.
        scores = [score for score, span in zip(scores, spans) if span[0]!=-1]
        spans = [span for span in spans if span[0]!=-1]

        # Decide the predicted labels with the Viterbi algorithm.
        labels = self.Viterbi(scores, num_entity_type)

        # Merge runs of tokens with the same label into entities.
        entities = []
        for label, group \
            in itertools.groupby(enumerate(labels), key=lambda x: x[1]):

            group = list(group)
            start = spans[group[0][0]][0]
            end = spans[group[-1][0]][1]

            if label != 0:  # an entity
                if 1 <= label <= num_entity_type:
                    # A B- label starts a new entity.
                    entity = {
                        "name": text[start:end],
                        "span": [start, end],
                        "type_id": label
                    }
                    entities.append(entity)
                else:
                    # An I- label extends the most recent entity.
                    entity['span'][1] = end
                    entity['name'] = text[entity['span'][0]:entity['span'][1]]

        return entities
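The two classes above are wired together at prediction time. A minimal sketch of how `NER_tokenizer_BIO.encode_plus_tagged` is meant to be called is shown below; the sentence, span, and `type_id` are illustrative assumptions rather than values taken from the repository, and the label count follows the 81-label configuration used here (2 × 40 entity types + 1).

```python
# Minimal sketch (assumed example values): encode one tagged sentence with the BIO tokenizer.
import NER_medNLP as ner

NUM_ENTITY_TYPE = 40  # 81 labels in config.json = 1 "O" label + 40 B- labels + 40 I- labels
tokenizer = ner.NER_tokenizer_BIO.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking',
    num_entity_type=NUM_ENTITY_TYPE,
)

text = 'アミオダロンが併用となった。'           # example sentence (assumed)
entities = [{'span': [0, 6], 'type_id': 1}]     # character span and entity type id (assumed)

encoding = tokenizer.encode_plus_tagged(text, entities, max_length=32)
print(encoding['labels'])  # 0 outside entities, type_id on the B token, type_id + 40 on I tokens
```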
README.md
CHANGED
@@ -1,3 +1,49 @@
---
language:
- ja
license:
- cc-by-4.0
tags:
- NER
- medical documents
datasets:
- MedTxt-CR-JA-training-v2.xml
metrics:
- NTCIR-16 Real-MedNLP subtask 1
---


This is a model for named entity recognition of Japanese medical documents.

### How to use

Download the following five files and put them in the same folder.

- id_to_tags.pkl
- key_attr.pkl
- NER_medNLP.py
- predict.py
- text.txt (the input file to be annotated; replace its contents with your own text as needed)

You can use this model by running predict.py.

```
python3 predict.py
```

### Input Example

```
肥大型心筋症、心房細動に対してWF投与が開始となった。
治療経過中に非持続性心室頻拍が認められたためアミオダロンが併用となった。
```

### Output Example

```
<d certainty="positive">肥大型心筋症、心房細動</d>に対して<m-key state="executed">WF</m-key>投与が開始となった。
<timex3 type="med">治療経過中</timex3>に<d certainty="positive">非持続性心室頻拍</d>が認められたため<m-key state="executed">アミオダロン</m-key>が併用となった。
```

### Publication

Tomohiro Nishiyama, Aki Ando, Mihiro Nishidani, Shuntaro Yada, Shoko Wakamiya, Eiji Aramaki: NAISTSOC at the NTCIR-16 Real-MedNLP Task. In Proceedings of the 16th NTCIR Conference on Evaluation of Information Access Technologies (NTCIR-16), pp. 330-333, 2022.
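The README only shows the command-line entry point; if you would rather drive the same pipeline from your own script, a small sketch is given below. It only rearranges calls that already exist in predict.py (the `id_to_tags` pickle is normally loaded in that script's `__main__` block, so it is assigned onto the module here), and the example sentence is the first line of the Input Example above.

```python
# Minimal sketch (assumed usage): call predict.py as a library instead of `python3 predict.py`.
import pickle
import unicodedata

import predict  # the predict.py shipped in this repository

with open("id_to_tags.pkl", "rb") as tf:
    predict.id_to_tags = pickle.load(tf)  # to_xml() reads this module-level name

sentence = unicodedata.normalize('NFKC', '肥大型心筋症、心房細動に対してWF投与が開始となった。')
result = predict.predict_entities("sociocom/RealMedNLP_CR_JA",
                                  [[sentence]], len(predict.id_to_tags))
print(predict.combine_sentences(result, '\n')[0])
```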
config.json
ADDED
@@ -0,0 +1,189 @@
{
  "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
  "architectures": ["BertForTokenClassification"],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0", "1": "LABEL_1", "2": "LABEL_2", "3": "LABEL_3", "4": "LABEL_4", "5": "LABEL_5", "6": "LABEL_6", "7": "LABEL_7", "8": "LABEL_8",
    "9": "LABEL_9", "10": "LABEL_10", "11": "LABEL_11", "12": "LABEL_12", "13": "LABEL_13", "14": "LABEL_14", "15": "LABEL_15", "16": "LABEL_16", "17": "LABEL_17",
    "18": "LABEL_18", "19": "LABEL_19", "20": "LABEL_20", "21": "LABEL_21", "22": "LABEL_22", "23": "LABEL_23", "24": "LABEL_24", "25": "LABEL_25", "26": "LABEL_26",
    "27": "LABEL_27", "28": "LABEL_28", "29": "LABEL_29", "30": "LABEL_30", "31": "LABEL_31", "32": "LABEL_32", "33": "LABEL_33", "34": "LABEL_34", "35": "LABEL_35",
    "36": "LABEL_36", "37": "LABEL_37", "38": "LABEL_38", "39": "LABEL_39", "40": "LABEL_40", "41": "LABEL_41", "42": "LABEL_42", "43": "LABEL_43", "44": "LABEL_44",
    "45": "LABEL_45", "46": "LABEL_46", "47": "LABEL_47", "48": "LABEL_48", "49": "LABEL_49", "50": "LABEL_50", "51": "LABEL_51", "52": "LABEL_52", "53": "LABEL_53",
    "54": "LABEL_54", "55": "LABEL_55", "56": "LABEL_56", "57": "LABEL_57", "58": "LABEL_58", "59": "LABEL_59", "60": "LABEL_60", "61": "LABEL_61", "62": "LABEL_62",
    "63": "LABEL_63", "64": "LABEL_64", "65": "LABEL_65", "66": "LABEL_66", "67": "LABEL_67", "68": "LABEL_68", "69": "LABEL_69", "70": "LABEL_70", "71": "LABEL_71",
    "72": "LABEL_72", "73": "LABEL_73", "74": "LABEL_74", "75": "LABEL_75", "76": "LABEL_76", "77": "LABEL_77", "78": "LABEL_78", "79": "LABEL_79", "80": "LABEL_80"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2, "LABEL_3": 3, "LABEL_4": 4, "LABEL_5": 5, "LABEL_6": 6, "LABEL_7": 7, "LABEL_8": 8,
    "LABEL_9": 9, "LABEL_10": 10, "LABEL_11": 11, "LABEL_12": 12, "LABEL_13": 13, "LABEL_14": 14, "LABEL_15": 15, "LABEL_16": 16, "LABEL_17": 17,
    "LABEL_18": 18, "LABEL_19": 19, "LABEL_20": 20, "LABEL_21": 21, "LABEL_22": 22, "LABEL_23": 23, "LABEL_24": 24, "LABEL_25": 25, "LABEL_26": 26,
    "LABEL_27": 27, "LABEL_28": 28, "LABEL_29": 29, "LABEL_30": 30, "LABEL_31": 31, "LABEL_32": 32, "LABEL_33": 33, "LABEL_34": 34, "LABEL_35": 35,
    "LABEL_36": 36, "LABEL_37": 37, "LABEL_38": 38, "LABEL_39": 39, "LABEL_40": 40, "LABEL_41": 41, "LABEL_42": 42, "LABEL_43": 43, "LABEL_44": 44,
    "LABEL_45": 45, "LABEL_46": 46, "LABEL_47": 47, "LABEL_48": 48, "LABEL_49": 49, "LABEL_50": 50, "LABEL_51": 51, "LABEL_52": 52, "LABEL_53": 53,
    "LABEL_54": 54, "LABEL_55": 55, "LABEL_56": 56, "LABEL_57": 57, "LABEL_58": 58, "LABEL_59": 59, "LABEL_60": 60, "LABEL_61": 61, "LABEL_62": 62,
    "LABEL_63": 63, "LABEL_64": 64, "LABEL_65": 65, "LABEL_66": 66, "LABEL_67": 67, "LABEL_68": 68, "LABEL_69": 69, "LABEL_70": 70, "LABEL_71": 71,
    "LABEL_72": 72, "LABEL_73": 73, "LABEL_74": 74, "LABEL_75": 75, "LABEL_76": 76, "LABEL_77": 77, "LABEL_78": 78, "LABEL_79": 79, "LABEL_80": 80
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertJapaneseTokenizer",
  "type_vocab_size": 2,
  "vocab_size": 32000
}
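The 81 entries in `id2label` are not arbitrary: under the BIO scheme implemented in NER_medNLP.py, n entity types produce 2n + 1 labels (one `O`, n `B-`, n `I-`), so 81 labels correspond to 40 entity types. predict.py passes `num_labels=81` and `num_entity_type=len(id_to_tags)`, which implies `id_to_tags.pkl` holds those 40 tag names; a quick consistency check under that assumption:

```python
# Consistency check (assumption: id_to_tags.pkl holds one entry per entity type).
import pickle

with open("id_to_tags.pkl", "rb") as tf:
    id_to_tags = pickle.load(tf)

num_entity_type = len(id_to_tags)
print(num_entity_type)                # expected: 40
assert 2 * num_entity_type + 1 == 81  # matches the 81 labels in config.json
```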
id_to_tags.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:57e7ea0bc4bdcaf4b19f7eec5c6edf2fce867cc9895cb20079b48881bc32ee5a
size 620
key_attr.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:de3465967518e40f5c2fb99f7baa31bf0f30ac7fb8317607a831bb9183a308fe
size 191
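This pickle is tiny (191 bytes); judging from how `value_to_key` in predict.py iterates over it, it is a small dict mapping each XML attribute name to the values that attribute can take. The structure below is a hypothetical illustration only, seeded with the two attribute/value pairs visible in the README output example; the real pickle will contain more entries.

```python
# Hypothetical structure for illustration only (not read from key_attr.pkl).
key_attr_example = {
    "certainty": ["positive"],  # e.g. <d certainty="positive">
    "state": ["executed"],      # e.g. <m-key state="executed">
}

def value_to_key(value, key_attr):
    # Same lookup idea as in predict.py: return the attribute name whose value list contains `value`.
    for attr_name, values in key_attr.items():
        if value in values:
            return attr_name

print(value_to_key("executed", key_attr_example))  # -> state
```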
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:802f29afc4eae3cbf49f3957f9b2d27ae247e6e079ad6d947ccf181dff7c754c
size 440383704
predict.py
ADDED
@@ -0,0 +1,143 @@
# %%
from tqdm import tqdm
import unicodedata
import re
import pickle
import torch
import NER_medNLP as ner
from bs4 import BeautifulSoup


# import from_XML_to_json as XtC
# import itertools
# import random
# import json
# from torch.utils.data import DataLoader
# from transformers import BertJapaneseTokenizer, BertForTokenClassification
# import pytorch_lightning as pl
# import pandas as pd
# import numpy as np
# import codecs
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# %% used as a module-level cache for value_to_key
dict_key = {}

# %%
def to_xml(data):
    # NOTE: id_to_tags is loaded in the __main__ block below and read here
    # as a module-level name.
    with open("key_attr.pkl", "rb") as tf:
        key_attr = pickle.load(tf)

    text = data['text']
    count = 0
    for i, entities in enumerate(data['entities_predicted']):
        if entities == "":
            return
        span = entities['span']
        type_id = id_to_tags[entities['type_id']].split('_')
        tag = type_id[0]

        if not type_id[1] == "":
            attr = ' ' + value_to_key(type_id[1], key_attr) + '=' + '"' + type_id[1] + '"'
        else:
            attr = ""

        add_tag = "<" + str(tag) + str(attr) + ">"
        text = text[:span[0]+count] + add_tag + text[span[0]+count:]
        count += len(add_tag)

        add_tag = "</" + str(tag) + ">"
        text = text[:span[1]+count] + add_tag + text[span[1]+count:]
        count += len(add_tag)
    return text


def predict_entities(modelpath, sentences_list, len_num_entity_type):
    # model = ner.BertForTokenClassification_pl.load_from_checkpoint(
    #     checkpoint_path = modelpath + ".ckpt"
    # )
    # bert_tc = model.bert_tc.cuda()

    model = ner.BertForTokenClassification_pl(modelpath, num_labels=81, lr=1e-5)
    bert_tc = model.bert_tc.to(device)

    MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    tokenizer = ner.NER_tokenizer_BIO.from_pretrained(
        MODEL_NAME,
        num_entity_type=len_num_entity_type  # remember to set the number of entity types correctly!
    )

    # entities_list = []  # gold entities would be collected here
    entities_predicted_list = []  # predicted entities are collected here

    text_entities_set = []
    for dataset in sentences_list:
        text_entities = []
        for sample in tqdm(dataset):
            text = sample
            encoding, spans = tokenizer.encode_plus_untagged(
                text, return_tensors='pt'
            )
            encoding = { k: v.to(device) for k, v in encoding.items() }

            with torch.no_grad():
                output = bert_tc(**encoding)
                scores = output.logits
                scores = scores[0].cpu().numpy().tolist()

            # Convert the classification scores into entities.
            entities_predicted = tokenizer.convert_bert_output_to_entities(
                text, scores, spans
            )

            # entities_list.append(sample['entities'])
            entities_predicted_list.append(entities_predicted)
            text_entities.append({'text': text, 'entities_predicted': entities_predicted})
        text_entities_set.append(text_entities)
    return text_entities_set

def combine_sentences(text_entities_set, insert: str):
    documents = []
    for text_entities in tqdm(text_entities_set):
        document = []
        for t in text_entities:
            document.append(to_xml(t))
        documents.append('\n'.join(document))
    return documents

def value_to_key(value, key_attr):  # get the attribute name from an attribute value
    global dict_key
    if dict_key.get(value) != None:
        return dict_key[value]
    for k in key_attr.keys():
        for v in key_attr[k]:
            if value == v:
                dict_key[v] = k
                return k

# %%
if __name__ == '__main__':
    with open("id_to_tags.pkl", "rb") as tf:
        id_to_tags = pickle.load(tf)
    with open("key_attr.pkl", "rb") as tf:
        key_attr = pickle.load(tf)
    with open('text.txt') as f:
        articles_raw = f.read()


    article_norm = unicodedata.normalize('NFKC', articles_raw)

    sentences_raw = [s for s in re.split(r'\n', articles_raw) if s != '']
    sentences_norm = [s for s in re.split(r'\n', article_norm) if s != '']

    text_entities_set = predict_entities("sociocom/RealMedNLP_CR_JA", [sentences_norm], len(id_to_tags))


    for i, texts_ent in enumerate(text_entities_set[0]):
        texts_ent['text'] = sentences_raw[i]


    documents = combine_sentences(text_entities_set, '\n')

    print(documents[0])
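The only subtle part of the script above is the offset bookkeeping in `to_xml`: every tag inserted before a span shifts all later character positions, so the function accumulates the inserted length in `count`. A stripped-down illustration of that idea, with a hard-coded tag name instead of the `id_to_tags` / `key_attr` lookups:

```python
# Toy illustration of the span-offset logic used by to_xml (tag names hard-coded here).
def insert_tags(text, entities):
    count = 0  # total length of the tags inserted so far
    for ent in entities:  # entities are assumed to be sorted by span start
        start, end = ent['span']
        open_tag, close_tag = '<' + ent['tag'] + '>', '</' + ent['tag'] + '>'
        text = text[:start + count] + open_tag + text[start + count:]
        count += len(open_tag)
        text = text[:end + count] + close_tag + text[end + count:]
        count += len(close_tag)
    return text

print(insert_tags('WF投与が開始となった。', [{'span': [0, 2], 'tag': 'm-key'}]))
# -> <m-key>WF</m-key>投与が開始となった。
```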
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ccce71084b8f6e81415e8f8e07cf27f59087aa2fda02c296959322ef8acb8a6a
size 440439601
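`model.safetensors` and `pytorch_model.bin` are two serializations of the same fine-tuned checkpoint. Since `config.json` and these weight files live in this repository, it should also be possible (untested here, and assuming the current directory is a local checkout) to load the token-classification head directly with plain transformers, without going through predict.py:

```python
# Assumption: the current directory is a checkout of this repository, so
# from_pretrained() can read config.json plus model.safetensors / pytorch_model.bin.
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained(".")
print(model.num_labels)  # expected: 81, matching id2label in config.json
```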
text.txt
ADDED
@@ -0,0 +1,15 @@
【目的】ワルファリン(WF)は血栓予防を目的に汎用されるが、その効果は個人差が大きく、また併用薬との相互作用により投与量設定に苦慮する場合が多い。
我々はWFの適正使用を目的に血中濃度測定を行い、相互作用に関する情報の収集を行っている。
本発表ではWFとベンズブロマロン、アミオダロンの併用例について抗凝固能をモニターし、同時にWF血中濃度測定を行い、それに基づいた相互作用の発現機序について考察した。
【方法】WF血中濃度は高速液体クロマトグラフィーにて測定を行い、採血時間は朝服用前とした。
【症例】《症例1》70歳女性。
脳梗塞、左房内血栓に対してWF投与開始となった。
後遺症としての痙攣発作に対してカルバマゼピン、またWF増量調節中に尿酸値が高めであったことからベンズブロマロンも併用となった。
《症例2》63歳男性。
肥大型心筋症、心房細動に対してWF投与が開始となった。
治療経過中に非持続性心室頻拍が認められたためアミオダロンが併用となった。
【結果・考察】いずれの症例においても、併用により血中濃度および抗凝固能に著明な変動が認められた。
症例1では特にS体WFの血中濃度変動が著明であるのに対し、症例2では立体選択的な挙動は認められず、相互作用の発現機序がなると考えられた。
前者の相互作用はチトクロムP450(CYP)2C9を介した相互作用であり、多型との関連が注目される。
日常臨床ではWFを介した相互作用の頻度は高いと予想され、血中濃度測定と抗凝固能の変動に留意する必要がある。