from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import torch
import matplotlib.pyplot as plt
import numpy as np

# Check whether a GPU is available
print('gpu available:', torch.cuda.is_available())

# Load the Japanese subset of the multilingual sentiments dataset
dataset = load_dataset("tyqiangz/multilingual-sentiments", "japanese")

# Switch to a pandas view for a quick look at the training split
dataset.set_format(type='pandas')
train_df = dataset['train'][:]

# Map integer labels to their string names
def label_int2str(x):
    return dataset["train"].features["label"].int2str(x)

train_df["label_name"] = train_df["label"].apply(label_int2str)

# Switch back from the pandas view to the default format
dataset.reset_format()

# Load the tokenizer for the Japanese BERT checkpoint
model_ckpt = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize with padding and truncation so examples share a common length
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

# Tokenize the entire dataset in a single pass
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)
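
# Optional check (not in the original script): the encoded dataset should now
# include input_ids and attention_mask columns alongside text and label.
print(dataset_encoded["train"].column_names)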

# Load the pretrained model with a three-class classification head and move it
# to the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
num_labels = 3

model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

# Accuracy and weighted F1 computed from the Trainer's predictions
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

# Training configuration: 10 epochs, batch size 16, evaluation at the end of
# each epoch, logging roughly once per epoch
batch_size = 16
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = "sample-text-classification-bert"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error"
)

# Fine-tune on the train split, evaluating on the validation split each epoch
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    tokenizer=tokenizer
)

print('start training..')
trainer.train()

# Attach human-readable label mappings to the model config before saving
id2label = {}
for i in range(dataset["train"].features["label"].num_classes):
    id2label[i] = dataset["train"].features["label"].int2str(i)

label2id = {}
for i in range(dataset["train"].features["label"].num_classes):
    label2id[dataset["train"].features["label"].int2str(i)] = i

trainer.model.config.id2label = id2label
trainer.model.config.label2id = label2id
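
# The confusion-matrix imports at the top of the script are otherwise unused;
# a minimal sketch of applying them to the validation split is shown here.
# The normalization and colormap choices are assumptions, not part of the
# original script.
preds_output = trainer.predict(dataset_encoded["validation"])
y_preds = np.argmax(preds_output.predictions, axis=1)
y_valid = np.array(dataset_encoded["validation"]["label"])
label_names = [id2label[i] for i in range(num_labels)]

cm = confusion_matrix(y_valid, y_preds, normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
disp.plot(cmap="Blues", values_format=".2f")
plt.title("Normalized confusion matrix (validation)")
plt.show()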

# Save the fine-tuned model (and tokenizer) to the output directory
print('save model.')
trainer.save_model('sample-text-classification-bert')
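
# A minimal inference sketch (not in the original script): reload the saved
# model with a text-classification pipeline. The sample sentence is a made-up
# placeholder ("This movie was really interesting.").
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="sample-text-classification-bert",
    tokenizer="sample-text-classification-bert",
)
print(classifier("この映画は本当に面白かった。"))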