# %% [markdown]
# ## Fine-tuning a pretrained model for Japanese sentiment analysis with Hugging Face
# A transcription of the code introduced in the following article:
# https://dev.classmethod.jp/articles/huggingface-jp-text-classification/
# %%
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import torch
import matplotlib.pyplot as plt
import numpy as np
# %%
print('gpu available:', torch.cuda.is_available())
# %% [markdown]
# ## Dataset
# %%
dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese")
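# %% [markdown]
# A quick sanity check (an addition, not part of the transcribed article):
# printing the `DatasetDict` confirms the available splits and their sizes,
# and `features` shows the text column and the 3-class label definition.
# %%
# Inspect splits, sizes, and column/label definitions
print(dataset)
print(dataset["train"].features)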
# %%
# Treat the dataset as a pandas DataFrame
dataset.set_format(type='pandas')
train_df = dataset['train'][:]
# %%
# Convert integer class labels to their string names
def label_int2str(x):
    return dataset["train"].features["label"].int2str(x)

train_df["label_name"] = train_df["label"].apply(label_int2str)
# %%
dataset.reset_format()
# %%
from transformers import AutoTokenizer
model_ckpt = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
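# %% [markdown]
# Note that this checkpoint's tokenizer relies on MeCab, so the `fugashi` and
# `ipadic` packages must be installed. As an illustrative addition, tokenizing
# a short sample sentence (the sentence itself is hypothetical) shows the
# subword units it produces:
# %%
sample = "私はこの映画が好きです。"  # "I like this movie."
encoded = tokenizer(sample)
print(encoded["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))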
# %%
# Tokenize a batch of texts, padding/truncating to a common length
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)
# %%
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)
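# %% [markdown]
# A brief check (added here for illustration): after `map`, each split should
# carry `input_ids`, `token_type_ids`, and `attention_mask` columns alongside
# the original fields.
# %%
print(dataset_encoded["train"].column_names)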
# %% [markdown]
# ## Model
# %%
import torch
from transformers import AutoModelForSequenceClassification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
num_labels = 3
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))
# %%
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
# %%
from transformers import TrainingArguments
batch_size = 16
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = "sample-text-classification-bert"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error"
)
# %%
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    tokenizer=tokenizer
)
print('start training..')
trainer.train()
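# %% [markdown]
# A sketch added here (it uses the `confusion_matrix` / `ConfusionMatrixDisplay`
# imports from the top of the script): run prediction on the validation split
# and plot a normalized confusion matrix.
# %%
preds_output = trainer.predict(dataset_encoded["validation"])
y_preds = np.argmax(preds_output.predictions, axis=1)
y_valid = preds_output.label_ids
class_names = dataset["train"].features["label"].names
cm = confusion_matrix(y_valid, y_preds, normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap="Blues", values_format=".2f")
plt.title("Normalized confusion matrix (validation)")
plt.show()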
# %%
# Attach human-readable label mappings to the model config
id2label = {}
for i in range(dataset["train"].features["label"].num_classes):
    id2label[i] = dataset["train"].features["label"].int2str(i)

label2id = {}
for i in range(dataset["train"].features["label"].num_classes):
    label2id[dataset["train"].features["label"].int2str(i)] = i

trainer.model.config.id2label = id2label
trainer.model.config.label2id = label2id
# %%
# Save the fine-tuned model and tokenizer
print('save model.')
trainer.save_model('sample-text-classification-bert')
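# %% [markdown]
# A hedged usage sketch (not part of the transcribed code): the saved model can
# be reloaded for inference via the `pipeline` API. The sample sentence is
# purely illustrative.
# %%
from transformers import pipeline
classifier = pipeline(
    "text-classification",
    model="sample-text-classification-bert",
    tokenizer=tokenizer
)
print(classifier("この映画はとても面白かったです。"))  # "This movie was very interesting."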