import pandas as pd
import numpy as np
import torch
import os
import logging
from typing import List, Union

from transformers import (
    AutoTokenizer,
    Trainer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline,
    AutoModel,
)
from datasets import load_dataset, Dataset, DatasetDict
import shap
import wandb
import evaluate
from codecarbon import EmissionsTracker

tracker = EmissionsTracker()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

device: str = "cuda" if torch.cuda.is_available() else "cpu"

SEED: int = 42
BATCH_SIZE: int = 16
EPOCHS: int = 3
SUBSAMPLING: float = 1.0  # fraction of each split to use; 1.0 trains on the full data set

# WandB configuration
os.environ["WANDB_PROJECT"] = "DAEDRA final model training"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

dataset = load_dataset("chrisvoncsefalvay/vaers-outcomes")

# Optionally subsample every split for quicker experimentation.
if SUBSAMPLING < 1:
    subsampled = DatasetDict()
    for split in dataset.keys():
        subsampled[split] = dataset[split].shuffle(seed=SEED).select(
            range(int(len(dataset[split]) * SUBSAMPLING))
        )
    dataset = subsampled

accuracy = evaluate.load("accuracy")
precision, recall = evaluate.load("precision"), evaluate.load("recall")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Accuracy, macro-/micro-averaged precision and recall, and micro-averaged F1."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "precision_macroaverage": precision.compute(predictions=predictions, references=labels, average="macro")["precision"],
        "precision_microaverage": precision.compute(predictions=predictions, references=labels, average="micro")["precision"],
        "recall_macroaverage": recall.compute(predictions=predictions, references=labels, average="macro")["recall"],
        "recall_microaverage": recall.compute(predictions=predictions, references=labels, average="micro")["recall"],
        "f1_microaverage": f1.compute(predictions=predictions, references=labels, average="micro")["f1"],
    }


# Map integer class indices to their human-readable label names.
label_map = {i: label for i, label in enumerate(dataset["test"].features["label"].names)}


def train_from_model(model_ckpt: str, push: bool = False):
    print(f"Initialising training based on {model_ckpt}...")

    print("Tokenising...")
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    # Tokenise the report text and drop every column except the label.
    cols = dataset["train"].column_names
    cols.remove("label")
    ds_enc = dataset.map(
        lambda x: tokenizer(x["text"], truncation=True, max_length=512),
        batched=True,
        remove_columns=cols,
    )

    print("Loading model...")
    try:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_ckpt,
            num_labels=len(dataset["test"].features["label"].names),
            id2label=label_map,
            label2id={v: k for k, v in label_map.items()},
        )
    except OSError:
        # Fall back to converting TensorFlow weights if no PyTorch checkpoint exists.
        model = AutoModelForSequenceClassification.from_pretrained(
            model_ckpt,
            num_labels=len(dataset["test"].features["label"].names),
            id2label=label_map,
            label2id={v: k for k, v in label_map.items()},
            from_tf=True,
        )

    args = TrainingArguments(
        output_dir="daedra",
        evaluation_strategy="steps",
        eval_steps=1000,
        save_strategy="steps",
        save_steps=2000,
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        logging_steps=1,
        run_name="daedra-full-train",
        report_to=["wandb", "codecarbon"],
        save_total_limit=2,
        load_best_model_at_end=True,
        push_to_hub=push,  # honour the push flag instead of always pushing
        hub_model_id="daedra",
        hub_strategy="every_save",
        metric_for_best_model="f1_microaverage",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_enc["train"],
        eval_dataset=ds_enc["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    wandb_tag: List[str] = ["full_sample"]
    wandb_tag.append(f"batch_size-{BATCH_SIZE}")
    wandb_tag.append(f"base:{model_ckpt}")

    # Use only the model name, without the organisation prefix, in the run name.
    if "/" in model_ckpt:
        sanitised_model_name = model_ckpt.split("/")[1]
    else:
        sanitised_model_name = model_ckpt

    wandb.init(
        name=f"daedra_{SUBSAMPLING}-{sanitised_model_name}",
        tags=wandb_tag,
        magic=True,
    )

    print("Starting training...")
    tracker.start()
    trainer.train()
    tracker.stop()
    print("Training finished.")

    wandb.finish()


if __name__ == "__main__":
    wandb.finish()  # close any stale run before starting a fresh one
    train_from_model("dmis-lab/biobert-base-cased-v1.2", push=True)
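

# A minimal inference sketch using the `pipeline` helper imported above, assuming
# training has finished and a fine-tuned checkpoint is available in the local
# "daedra" output directory (or under the corresponding Hub model id). The model
# path and the report text are illustrative assumptions, not part of the run above.
#
# from transformers import pipeline
#
# classifier = pipeline(
#     "text-classification",
#     model="daedra",  # assumption: path or Hub id of the saved fine-tuned model
#     device=0 if torch.cuda.is_available() else -1,
# )
# print(classifier("Patient developed a mild fever and localised swelling after vaccination."))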