# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs

DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States.

In [1]:
%pip install accelerate -U

/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install transformers datasets shap watermark wandb evaluate codecarbon

/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Note: you may need to restart the kernel to use updated packages.


In [28]:
import pandas as pd
import numpy as np
import torch
import os
from typing import List, Union
from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline, AutoModel
from datasets import load_dataset, Dataset, DatasetDict
import shap
import wandb
import evaluate
import logging

# Close any Weights & Biases run that is still active (e.g. left over from a
# previous execution of this notebook) before new runs are started below.
wandb.finish()


# Set before any tokenizer is created; "false" turns off the Hugging Face
# tokenizers library's own parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [4]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device: str = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed used when shuffling the dataset for subsampling.
SEED: int = 42

# Per-device batch size (training and evaluation) and number of training epochs.
BATCH_SIZE: int = 32
EPOCHS: int = 5
# Default checkpoint name; used as the default tokenizer model of encode_ds().
model_ckpt: str = "distilbert-base-uncased"

# WandB configuration
# These environment variables are read by wandb / the Trainer's W&B integration.
os.environ["WANDB_PROJECT"] = "DAEDRA multiclass model training" 
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints
os.environ["WANDB_NOTEBOOK_NAME"] = "DAEDRA.ipynb"

In [5]:
%watermark --iversion

re      : 2.2.1
torch   : 1.12.0
wandb   : 0.16.2
logging : 0.5.1.2
numpy   : 1.23.5
pandas  : 2.0.2
evaluate: 0.4.1
shap    : 0.44.1



In [6]:
!nvidia-smi

/bin/bash: /anaconda/envs/azureml_py38_PT_TF/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Mon Jan 29 15:20:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-16GB           Off | 00000001:00:00.0 Off |                  Off |
| N/A   27C    P0              25W / 250W |      4MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+------------------------------------

## Loading the data set

In [7]:
# Load the VAERS outcomes dataset by its Hugging Face Hub identifier. The result
# is a DatasetDict with `train`, `test` and `val` splits, each carrying the
# columns `id`, `text` and `label`.
dataset = load_dataset("chrisvoncsefalvay/vaers-outcomes")

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1270444
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 272238
    })
    val: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 272238
    })
})

In [8]:
# Fraction of every split to keep. Values of 1 or above leave the data untouched.
SUBSAMPLING = 0.01

if SUBSAMPLING < 1:
    # Shuffle each split with the shared seed, then keep only its first
    # int(len(split) * SUBSAMPLING) rows.
    dataset = DatasetDict({
        split_name: split.shuffle(seed=SEED).select(range(int(len(split) * SUBSAMPLING)))
        for split_name, split in dataset.items()
    })

## Tokenisation and encoding

In [10]:
def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:
    """Tokenise the ``text`` column of a dataset, or of every split in a dataset dict.

    Args:
        ds: A ``Dataset`` or ``DatasetDict`` whose rows have a ``text`` column.
        tokenizer_model: Name or path of the checkpoint whose tokenizer is
            loaded. Defaults to the module-level ``model_ckpt``.

    Returns:
        The object returned by ``ds.map``: the input with the tokenizer's
        output columns added. The original columns are kept.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

    # Same tokenisation settings as train_from_model(): truncate to at most
    # 512 tokens and process the rows in batches.
    ds_enc = ds.map(
        lambda batch: tokenizer(batch["text"], truncation=True, max_length=512),
        batched=True,
    )
    return ds_enc

## Evaluation metrics

In [9]:
# Metric objects from the `evaluate` library; compute_metrics() reads these
# module-level names.
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

In [10]:
def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``. The
            predictions are reduced to class ids with an argmax over axis 1
            (assumed to be per-class scores of shape (n_samples, n_classes) —
            TODO confirm against the Trainer output).

    Returns:
        A dict with accuracy plus macro- and micro-averaged precision, recall
        and F1. ``f1_macroaverage`` is reported alongside the existing
        ``f1_microaverage`` so that F1 matches precision and recall in having
        both averages.
    """
    scores, labels = eval_pred
    predictions = np.argmax(scores, axis=1)

    def _score(metric, key, **kwargs):
        # All metrics share the same predictions/references; only the
        # averaging mode and the key of the result differ.
        return metric.compute(predictions=predictions, references=labels, **kwargs)[key]

    return {
        'accuracy': _score(accuracy, "accuracy"),
        'precision_macroaverage': _score(precision, "precision", average='macro'),
        'precision_microaverage': _score(precision, "precision", average='micro'),
        'recall_macroaverage': _score(recall, "recall", average='macro'),
        'recall_microaverage': _score(recall, "recall", average='micro'),
        'f1_macroaverage': _score(f1, "f1", average='macro'),
        'f1_microaverage': _score(f1, "f1", average='micro'),
    }

## Training

We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :(

In [11]:
# Integer class id -> label name, taken from the `label` feature of the test split.
label_map = dict(enumerate(dataset["test"].features["label"].names))

In [12]:
def train_from_model(model_ckpt: str, push: bool = False):
    """Fine-tune a sequence classifier from ``model_ckpt`` on the module-level ``dataset``.

    The function tokenises every split of ``dataset``, loads
    ``AutoModelForSequenceClassification`` with the label mapping from
    ``label_map``, trains it with a ``Trainer`` that reports to Weights &
    Biases, and optionally pushes the tokenizer and model to the Hub.

    Evaluation after each epoch, and therefore the checkpoint restored by
    ``load_best_model_at_end``, uses the ``val`` split when it exists, so the
    ``test`` split is not used for model selection. If there is no ``val``
    split the ``test`` split is used.

    Args:
        model_ckpt: Name or path of the base checkpoint to fine-tune.
        push: When true, push the tokenizer and the trained model to the
            ``chrisvoncsefalvay/daedra`` Hub repository after training.

    Returns:
        None.
    """
    print(f"Initialising training based on {model_ckpt}...")

    print("Tokenising...")
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    # Drop every original column except the label; map() adds the tokenizer outputs.
    cols = dataset["train"].column_names
    cols.remove("label")
    ds_enc = dataset.map(lambda x: tokenizer(x["text"], truncation=True, max_length=512), batched=True, remove_columns=cols)

    print("Loading model...")
    model_kwargs = dict(
        num_labels=len(dataset["test"].features["label"].names),
        id2label=label_map,
        label2id={v: k for k, v in label_map.items()},
    )
    try:
        model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, **model_kwargs)
    except OSError:
        # Loading failed with OSError: retry, asking for TensorFlow weights instead.
        model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, from_tf=True, **model_kwargs)

    args = TrainingArguments(
        output_dir="vaers",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=.01,
        logging_steps=1,
        load_best_model_at_end=True,
        seed=SEED,  # same seed as the subsampling shuffle, stated explicitly
        run_name="daedra-training",
        report_to=["wandb"])

    # Keep the test split out of per-epoch evaluation / best-checkpoint selection
    # whenever a dedicated validation split is available.
    eval_split = "val" if "val" in ds_enc else "test"

    trainer = Trainer(
            model=model,
            args=args,
            train_dataset=ds_enc["train"],
            eval_dataset=ds_enc[eval_split],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics)

    # The same label tags the W&B run and names the pushed model variant.
    variant = "full_sample" if SUBSAMPLING == 1.0 else f"subsample-{SUBSAMPLING}"
    wandb_tag: List[str] = [
        variant,
        f"batch_size-{BATCH_SIZE}",
        f"base:{model_ckpt}",
    ]

    wandb.init(name=f"daedra_{SUBSAMPLING}-{model_ckpt}", tags=wandb_tag, magic=True)

    print("Starting training...")

    trainer.train()

    print("Training finished.")

    if push:
        # Write the tokenizer definition to a local tokenizer.json as well as pushing it.
        tokenizer._tokenizer.save("tokenizer.json")
        tokenizer.push_to_hub("chrisvoncsefalvay/daedra")
        sample = "full sample" if SUBSAMPLING == 1.0 else f"{SUBSAMPLING * 100}% of the full sample"

        model.push_to_hub("chrisvoncsefalvay/daedra", 
                        variant=variant,
                        commit_message=f"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,}), based on {model_ckpt}")

In [13]:

# Hugging Face checkpoint names of the candidate base models.
# NOTE(review): presumably each entry is passed to train_from_model(); the
# code that consumes this list is not visible here — confirm.
base_models: List[str] = [
    "bert-base-uncased",
    "distilbert-base-uncased",
]