import itertools

import numpy as np
import torch
from tqdm import tqdm
from transformers import BertJapaneseTokenizer, BertForTokenClassification
import pytorch_lightning as pl


class BertForTokenClassification_pl(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr):
        super().__init__()
        # Store model_name, num_labels and lr in self.hparams.
        self.save_hyperparameters()
        # Load the pretrained BERT model with a token-classification head.
        self.bert_tc = BertForTokenClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        output = self.bert_tc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.bert_tc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
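# A minimal fine-tuning sketch; the checkpoint name, the label count and the
# dataloaders below are assumptions, not fixed by this module. The
# dataloaders must yield dicts of tensors accepted by
# BertForTokenClassification (input_ids, attention_mask, labels, ...).
#
#   model = BertForTokenClassification_pl(
#       'cl-tohoku/bert-base-japanese-whole-word-masking',  # assumed checkpoint
#       num_labels=17,  # 2*num_entity_type + 1 labels for BIO with 8 types
#       lr=1e-5
#   )
#   trainer = pl.Trainer(max_epochs=5)
#   trainer.fit(model, train_dataloader, val_dataloader)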
|
|
class NER_tokenizer_BIO(BertJapaneseTokenizer):

    def __init__(self, *args, **kwargs):
        # The number of entity types is passed in when the tokenizer is
        # loaded (e.g. via from_pretrained) and removed from kwargs before
        # they reach BertJapaneseTokenizer.
        self.num_entity_type = kwargs.pop('num_entity_type')
        super().__init__(*args, **kwargs)

    def encode_plus_tagged(self, text, entities, max_length):
        """
        Given a sentence and the named entities it contains,
        encode the sentence and build the BIO label sequence.
        """
        # Split the text into labeled (entity) and unlabeled spans.
        splitted = []
        position = 0
        for entity in entities:
            start = entity['span'][0]
            end = entity['span'][1]
            label = entity['type_id']
            splitted.append({'text': text[position:start], 'label': 0})
            splitted.append({'text': text[start:end], 'label': label})
            position = end
        splitted.append({'text': text[position:], 'label': 0})
        splitted = [s for s in splitted if s['text']]

        # Tokenize each span and assign BIO labels: the first token of an
        # entity gets the B-label (1..num_entity_type), the remaining
        # tokens get the I-label (label + num_entity_type).
        tokens = []
        labels = []
        for s in splitted:
            tokens_splitted = self.tokenize(s['text'])
            label = s['label']
            if label > 0:  # tokens inside a named entity
                labels_splitted = \
                    [label + self.num_entity_type] * len(tokens_splitted)
                labels_splitted[0] = label
            else:  # tokens outside any named entity
                labels_splitted = [0] * len(tokens_splitted)
            tokens.extend(tokens_splitted)
            labels.extend(labels_splitted)

        # Encode the tokens, padding/truncating to max_length.
        input_ids = self.convert_tokens_to_ids(tokens)
        encoding = self.prepare_for_model(
            input_ids,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )

        # Label the special tokens [CLS] and [SEP] as 0,
        # then pad the label sequence to max_length.
        labels = [0] + labels[:max_length-2] + [0]
        labels = labels + [0] * (max_length - len(labels))
        encoding['labels'] = labels

        return encoding
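    # Illustration (hypothetical spans/type_id; token boundaries depend on
    # the vocabulary): with num_entity_type = 2 and
    #   entities = [{'span': [0, 5], 'type_id': 1}]
    # the tokens covering text[0:5] are labeled 1 (B of type 1) for the
    # first token and 1 + 2 = 3 (I of type 1) for the rest; all other
    # tokens get 0 (O).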
|
    def encode_plus_untagged(
        self, text, max_length=None, return_tensors=None
    ):
        """
        Tokenize a sentence and record each token's position in the
        original text. Identical to encode_plus_untagged of the
        IO-scheme tokenizer.
        """
        # Tokenize the text into subwords while keeping, for each token,
        # the surface string it came from.
        tokens = []
        tokens_original = []
        words = self.word_tokenizer.tokenize(text)
        for word in words:
            tokens_word = self.subword_tokenizer.tokenize(word)
            tokens.extend(tokens_word)
            if tokens_word[0] == '[UNK]':  # unknown word: keep it whole
                tokens_original.append(word)
            else:
                tokens_original.extend([
                    token.replace('##', '') for token in tokens_word
                ])

        # Find the character span of each token in the original text.
        position = 0
        spans = []
        for token in tokens_original:
            l = len(token)
            while True:
                if token != text[position:position+l]:
                    position += 1
                else:
                    spans.append([position, position+l])
                    position += l
                    break

        # Encode the tokens; pad/truncate only if max_length is given.
        input_ids = self.convert_tokens_to_ids(tokens)
        encoding = self.prepare_for_model(
            input_ids,
            max_length=max_length,
            padding='max_length' if max_length else False,
            truncation=True if max_length else False
        )
        sequence_length = len(encoding['input_ids'])

        # Align spans with the encoded sequence: [-1, -1] marks the
        # special tokens at both ends and the padding.
        spans = [[-1, -1]] + spans[:sequence_length-2]
        spans = spans + [[-1, -1]] * (sequence_length - len(spans))

        # Optionally convert to PyTorch tensors.
        if return_tensors == 'pt':
            encoding = {k: torch.tensor([v]) for k, v in encoding.items()}

        return encoding, spans
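    # Illustration: spans[i] holds the [start, end] character offsets of
    # the i-th encoded token in `text`, so text[spans[1][0]:spans[1][1]]
    # recovers the surface form of the first real token, while [CLS],
    # [SEP] and padding positions carry [-1, -1].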
|
    @staticmethod
    def Viterbi(scores_bert, num_entity_type, penalty=10000):
        """
        Find the optimal label sequence with the Viterbi algorithm.
        """
        m = 2*num_entity_type + 1  # number of labels: O + B-tags + I-tags
        # Build the transition penalty matrix: moving into an I-label j is
        # penalized unless it continues the same I-label (i == j) or
        # follows its B-label (i + num_entity_type == j).
        penalty_matrix = np.zeros([m, m])
        for i in range(m):
            for j in range(1+num_entity_type, m):
                if not ((i == j) or (i+num_entity_type == j)):
                    penalty_matrix[i, j] = penalty

        # Initialize one candidate path per label; row 0 of the penalty
        # matrix also penalizes starting on an I-label.
        path = [[i] for i in range(m)]
        scores_path = scores_bert[0] - penalty_matrix[0, :]
        scores_bert = scores_bert[1:]

        # Dynamic programming: at each position keep, for every label,
        # the best-scoring path that ends in that label.
        for scores in scores_bert:
            assert len(scores) == 2*num_entity_type + 1
            score_matrix = np.array(scores_path).reshape(-1, 1) \
                + np.array(scores).reshape(1, -1) \
                - penalty_matrix
            scores_path = score_matrix.max(axis=0)
            argmax = score_matrix.argmax(axis=0)
            path_new = []
            for i, idx in enumerate(argmax):
                path_new.append(path[idx] + [i])
            path = path_new

        # Return the path with the highest total score.
        labels_optimal = path[np.argmax(scores_path)]
        return labels_optimal
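    # Label layout illustration for num_entity_type = 2 (hypothetical):
    #   0 = O, 1 = B-type1, 2 = B-type2, 3 = I-type1, 4 = I-type2.
    # penalty_matrix penalizes any transition into 3 unless it comes from
    # 1 or 3, and into 4 unless from 2 or 4, so the decoded sequence never
    # contains an I-label whose entity was not opened by its B-label.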
|
    def convert_bert_output_to_entities(self, text, scores, spans):
        """
        Extract named entities from the sentence, the classification
        scores and each token's position. The scores are a 2-D array
        of shape (sequence length, number of labels).
        """
        assert len(spans) == len(scores)
        num_entity_type = self.num_entity_type

        # Drop special tokens and padding (span == [-1, -1]).
        scores = [score for score, span in zip(scores, spans) if span[0] != -1]
        spans = [span for span in spans if span[0] != -1]

        # Decode the best label sequence with the Viterbi algorithm.
        labels = self.Viterbi(scores, num_entity_type)

        # Group consecutive identical labels and rebuild the entities.
        entities = []
        for label, group \
            in itertools.groupby(enumerate(labels), key=lambda x: x[1]):

            group = list(group)
            start = spans[group[0][0]][0]
            end = spans[group[-1][0]][1]

            if label != 0:  # skip tokens labeled O
                if 1 <= label <= num_entity_type:
                    # B-label: a new entity starts here.
                    entity = {
                        "name": text[start:end],
                        "span": [start, end],
                        "type_id": label
                    }
                    entities.append(entity)
                else:
                    # I-label: extend the entity opened by the preceding
                    # B-label (the Viterbi penalties ensure one exists).
                    entity['span'][1] = end
                    entity['name'] = text[entity['span'][0]:entity['span'][1]]

        return entities
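

# A minimal end-to-end inference sketch. The checkpoint name, the number of
# entity types and the example sentence are assumptions; in practice the
# classification head should come from a trained checkpoint rather than the
# raw pretrained weights used here.
if __name__ == '__main__':
    MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'  # assumed
    NUM_ENTITY_TYPE = 8  # assumed number of entity types

    tokenizer = NER_tokenizer_BIO.from_pretrained(
        MODEL_NAME, num_entity_type=NUM_ENTITY_TYPE
    )
    model = BertForTokenClassification_pl(
        MODEL_NAME, num_labels=2*NUM_ENTITY_TYPE+1, lr=1e-5
    )

    # Example input ("Yesterday I ate an apple in Tokyo.").
    text = '昨日は東京でリンゴを食べた。'
    encoding, spans = tokenizer.encode_plus_untagged(
        text, return_tensors='pt'
    )
    with torch.no_grad():
        output = model.bert_tc(**encoding)
    scores = output.logits[0].numpy().tolist()
    entities = tokenizer.convert_bert_output_to_entities(
        text, scores, spans
    )
    print(entities)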