# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs

DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States.

In [1]:
# %pip install accelerate -U

In [2]:
%pip install transformers datasets shap watermark wandb scikit-multilearn evaluate codecarbon

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import torch
import os
from typing import List, Union
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, pipeline
from datasets import load_dataset, Dataset, DatasetDict
from pyarrow import Table
import shap
import wandb
import evaluate
from codecarbon import EmissionsTracker

os.environ["TOKENIZERS_PARALLELISM"] = "false"
tracker = EmissionsTracker()

%load_ext watermark

  from .autonotebook import tqdm as notebook_tqdm
2024-01-28 19:47:15.508449: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-28 19:47:16.502791: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-01-28 19:47:16.502915: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [4]:
device: str = 'cuda' if torch.cuda.is_available() else 'cpu'

SEED: int = 42

BATCH_SIZE: int = 16
EPOCHS: int = 3
model_ckpt: str = "distilbert-base-uncased"

# WandB configuration
os.environ["WANDB_PROJECT"] = "DAEDRA multiclass model training" 
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints
os.environ["WANDB_NOTEBOOK_NAME"] = "DAEDRA.ipynb"

In [5]:
%watermark --iversion

numpy   : 1.23.5
re      : 2.2.1
evaluate: 0.4.1
pandas  : 2.0.2
wandb   : 0.16.2
shap    : 0.44.1
torch   : 1.12.0
logging : 0.5.1.2



In [6]:
!nvidia-smi

Sun Jan 28 19:47:19 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-16GB           Off | 00000001:00:00.0 Off |                  Off |
| N/A   29C    P0              25W / 250W |      4MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE-16GB           Off | 00000002:00:00.0 Off |  

## Loading the data set

In [7]:
dataset = load_dataset("chrisvoncsefalvay/vaers-outcomes")

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1270444
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 272238
    })
    val: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 272238
    })
})

In [9]:
SUBSAMPLING = 0.1

if SUBSAMPLING < 1:
    _ = DatasetDict()
    for each in dataset.keys():
        _[each] = dataset[each].shuffle(seed=SEED).select(range(int(len(dataset[each]) * SUBSAMPLING)))

    dataset = _

## Tokenisation and encoding

In [10]:
def encode_ds(ds: Union[Dataset, DatasetDict], tokenizer_model: str = model_ckpt) -> Union[Dataset, DatasetDict]:
    return ds_enc

## Evaluation metrics

In [11]:
accuracy = evaluate.load("accuracy")
precision, recall = evaluate.load("precision"), evaluate.load("recall")
f1 = evaluate.load("f1")

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        'accuracy': accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        'precision_macroaverage': precision.compute(predictions=predictions, references=labels, average='macro')["precision"],
        'precision_microaverage': precision.compute(predictions=predictions, references=labels, average='micro')["precision"],
        'recall_macroaverage': recall.compute(predictions=predictions, references=labels, average='macro')["recall"],
        'recall_microaverage': recall.compute(predictions=predictions, references=labels, average='micro')["recall"],
        'f1_microaverage': f1.compute(predictions=predictions, references=labels, average='micro')["f1"]
    }

## Training

We specify a label map – this has to be done manually, even if `Datasets` has a function for it, as `AutoModelForSequenceClassification` requires an object with a length :(

In [13]:
label_map = {i: label for i, label in enumerate(dataset["test"].features["label"].names)}

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

cols = dataset["train"].column_names
cols.remove("label")
ds_enc = dataset.map(lambda x: tokenizer(x["text"], truncation=True), batched=True, remove_columns=cols)

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, 
    num_labels=len(dataset["test"].features["label"].names), 
    id2label=label_map, 
    label2id={v:k for k,v in label_map.items()})

args = TrainingArguments(
    output_dir="vaers",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=.01,
    logging_steps=1,
    load_best_model_at_end=True,
    run_name=f"daedra-training",
    report_to=["wandb"])

trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_enc["train"],
        eval_dataset=ds_enc["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics)

Map: 100%|██████████| 127044/127044 [00:53<00:00, 2384.54 examples/s]
Map: 100%|██████████| 27223/27223 [00:11<00:00, 2396.71 examples/s]
Map: 100%|██████████| 27223/27223 [00:11<00:00, 2375.38 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
if SUBSAMPLING != 1.0:
    wandb_tag: List[str] = [f"subsample-{SUBSAMPLING}"]
else:
    wandb_tag: List[str] = [f"full_sample"]

wandb_tag.append(f"batch_size-{BATCH_SIZE}")
wandb_tag.append(f"base:{model_ckpt}")
    
wandb.init(name="daedra_training_run", tags=wandb_tag, magic=True)

[34m[1mwandb[0m: Currently logged in as: [33mchrisvoncsefalvay[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [16]:
tracker.start()
trainer.train()
tracker.stop()


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macroaverage,Precision Microaverage,Recall Macroaverage,Recall Microaverage,F1 Microaverage
1,0.2513,0.362917,0.865775,0.701081,0.865775,0.55657,0.865775,0.865775
2,0.036,0.352118,0.870551,0.728051,0.870551,0.609787,0.870551,0.870551


[34m[1mwandb[0m: Adding directory to artifact (./vaers/checkpoint-3971)... Done. 18.2s
Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
[34m[1mwandb[0m: Adding directory to artifact (./vaers/checkpoint-7942)... 

In [None]:
wandb.finish()

0,1
eval/accuracy,▁▇█
eval/f1_microaverage,▁▇█
eval/loss,█▃▁
eval/precision_macroaverage,▁▇█
eval/precision_microaverage,▁▇█
eval/recall_macroaverage,▁▇█
eval/recall_microaverage,▁▇█
eval/runtime,▁▃█
eval/samples_per_second,█▆▁
eval/steps_per_second,█▆▁

0,1
eval/accuracy,0.84019
eval/f1_microaverage,0.84019
eval/loss,0.44011
eval/precision_macroaverage,0.415
eval/precision_microaverage,0.84019
eval/recall_macroaverage,0.40704
eval/recall_microaverage,0.84019
eval/runtime,10.0118
eval/samples_per_second,271.878
eval/steps_per_second,8.59


In [None]:
variant = "full_sample" if SUBSAMPLING == 1.0 else f"subsample-{SUBSAMPLING}"
tokenizer._tokenizer.save("tokenizer.json")
tokenizer.push_to_hub("chrisvoncsefalvay/daedra")
sample = "full sample" if SUBSAMPLING == 1.0 else f"{SUBSAMPLING * 100}% of the full sample"

model.push_to_hub("chrisvoncsefalvay/daedra", 
                  variant=variant,
                  commit_message=f"DAEDRA model trained on {sample} of the VAERS dataset (training set size: {dataset['train'].num_rows:,})")

CommitInfo(commit_url='https://huggingface.co/chrisvoncsefalvay/daedra/commit/c482ca6c8520142a3e67df4be25a408e6b557053', commit_message='DAEDRA model trained on 1.0% of the full sample of the VAERS dataset (training set size: 12,704)', commit_description='', oid='c482ca6c8520142a3e67df4be25a408e6b557053', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from collections import Counter

def get_most_frequent_unknown_tokens(tokenizer, dataset):
    unknown_tokens = []
    
    # Tokenize each text in the dataset
    for example in dataset:
        tokens = tokenizer.tokenize(example['text'])
        
        # Check if each token is the 'unknown' special token
        for token in tokens:
            if token == tokenizer.unk_token:
                unknown_tokens.append(token)
    
    # Count the frequency of each unique unknown token
    token_counts = Counter(unknown_tokens)
    
    # Sort the tokens based on their frequency in descending order
    most_frequent_tokens = token_counts.most_common()
    
    return most_frequent_tokens

# Example usage
tokenizer = YourTokenizer()  # Replace with your tokenizer
dataset = YourDataset()  # Replace with your dataset

most_frequent_unknown_tokens = get_most_frequent_unknown_tokens(tokenizer, dataset)
print(most_frequent_unknown_tokens)
