# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs

DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States.

In [2]:
import pandas as pd
import numpy as np
import torch
import os
from typing import List
from datasets import load_dataset
import shap
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline

%load_ext watermark

ModuleNotFoundError: No module named 'torch'

In [None]:
# --- Run configuration -------------------------------------------------------

# Seed value and training-loop hyperparameters.
SEED: int = 42
EPOCHS: int = 1
BATCH_SIZE: int = 8

# Checkpoint identifier handed to the tokenizer and model loaders.
model_ckpt: str = "distilbert-base-uncased"

# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
device: str
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Outcome class names, ordered by label index.
CLASS_NAMES: List[str] = [
    "DIED",
    "ER_VISIT",
    "HOSPITAL",
    "OFC_VISIT",
    "X_STAY",
    "DISABLE",
    "D_PRESENTED",
]

# Weights & Biases settings: the project runs are filed under, and the
# instruction to log all model checkpoints.
os.environ.update({
    "WANDB_PROJECT": "DAEDRA model training",
    "WANDB_LOG_MODEL": "checkpoint",
})

In [None]:
# Report package versions through the `watermark` IPython extension
# (presumably the versions of the modules imported above — confirm the flag).
%watermark --iversion

In [4]:
# Shell out to `nvidia-smi` to record the GPUs available to this session.
!nvidia-smi

Sun Jan 28 01:31:42 2024 
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
| 0 Tesla V100-PCIE-16GB Off | 00000001:00:00.0 Off | Off |
| N/A 28C P0 37W / 250W | 0MiB / 16384MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 1 Tesla V100-PCIE-16GB Off | 00000002:00:00.0 Off | Off |
| N/A 29C P0 36W / 250W | 0MiB / 16384MiB | 1% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
 
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI 

## Loading the data set

In [None]:
# Load the VAERS outcomes dataset by its repository identifier.
dataset = load_dataset(path="chrisvoncsefalvay/vaers-outcomes")

### Tokenisation and encoding

In [None]:
# Tokenizer that belongs to the checkpoint named in `model_ckpt`.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)

In [None]:
def tokenize_and_encode(examples):
    """Tokenise the "text" field of `examples`, truncating over-long inputs.

    Used as the mapping function for `dataset.map(..., batched=True)`; the
    tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Drop every original column except "labels" once the text has been tokenised.
train_split = dataset["train"]
cols = train_split.column_names
cols.remove("labels")  # raises ValueError if there is no "labels" column
ds_enc = dataset.map(
    tokenize_and_encode,
    remove_columns=cols,
    batched=True,
)

### Training

In [None]:
class MultiLabelTrainer(Trainer):
    """`Trainer` variant that optimises a multi-label objective.

    The loss is `torch.nn.BCEWithLogitsLoss`, applied to the logits and the
    labels after both are reshaped to `(-1, num_labels)`.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the binary cross-entropy-with-logits loss for one batch.

        Args:
            model: Model used for the forward pass.
            inputs: Keyword arguments for the model. Must contain a "labels"
                entry, which is removed before the forward pass.
            return_outputs: When true, return `(loss, outputs)` instead of the
                loss alone.
            num_items_in_batch: Accepted only so the override stays callable
                by `Trainer` versions that pass this argument; it is not used.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # BCEWithLogitsLoss needs float targets shaped like the logits.
        loss = loss_fct(
            logits.view(-1, num_labels),
            labels.float().view(-1, num_labels),
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
# One output logit per outcome class. The label count is derived from
# CLASS_NAMES (the notebook never defines a `num_labels` variable), and the
# model is placed on `device` so the cell also runs without a GPU.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(CLASS_NAMES),
).to(device)

In [None]:
def accuracy_threshold(y_pred, y_true, threshold=.5, sigmoid=True):
    """Return the element-wise accuracy of thresholded multi-label predictions.

    Args:
        y_pred: Prediction scores (logits when `sigmoid` is true). NumPy
            arrays, tensors and nested sequences are accepted.
        y_true: Binary ground-truth indicators, expected to match the shape
            of `y_pred`.
        threshold: A score strictly greater than this counts as positive.
        sigmoid: Whether to pass `y_pred` through a sigmoid before
            thresholding.

    Returns:
        The fraction of individual label decisions that agree with `y_true`,
        as a Python float.
    """
    # as_tensor wraps NumPy arrays like from_numpy did, but also accepts
    # tensors and plain Python sequences.
    y_pred = torch.as_tensor(y_pred)
    y_true = torch.as_tensor(y_true)

    if sigmoid:
        y_pred = y_pred.sigmoid()

    return ((y_pred > threshold) == y_true.bool()).float().mean().item()

In [None]:
def compute_metrics(eval_pred):
    """Build the metrics dictionary for an evaluation pass.

    `eval_pred` is unpacked into a (predictions, labels) pair, which is scored
    by `accuracy_threshold` with that function's default settings.
    """
    raw_scores, gold = eval_pred
    score = accuracy_threshold(raw_scores, gold)
    return {'accuracy_thresh': score}

In [None]:
# Training configuration. Evaluation runs once per epoch and metrics are
# reported to Weights & Biases. SEED is passed explicitly so the notebook's
# seed constant governs the run rather than the library default.
args = TrainingArguments(
    output_dir="vaers",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=.01,
    seed=SEED,
    report_to=["wandb"],
)

In [None]:
# Wire the model, training arguments, encoded splits and metric callback into
# the multi-label trainer.
multi_label_trainer = MultiLabelTrainer(
    model=model,
    args=args,
    train_dataset=ds_enc["train"],
    eval_dataset=ds_enc["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate before training; the returned metrics are echoed as the cell output.
pre_training_metrics = multi_label_trainer.evaluate()
pre_training_metrics

In [None]:
# Run training; the value returned by train() is echoed as the cell output.
train_result = multi_label_trainer.train()
train_result

### Evaluation

We instantiate a classifier `pipeline` and push it to CUDA.

In [None]:
# Inference pipeline; it is handed to the SHAP explainer further down.
#  - top_k=None returns a score for every class instead of only the best one,
#    which an explainer needs in order to attribute each output.
#  - function_to_apply="sigmoid" scores each class independently, matching
#    the BCE-with-logits objective the model is trained with.
#  - device follows the notebook-wide `device` rather than a fixed GPU index.
classifier = pipeline(
    "text-classification",
    model,
    tokenizer=tokenizer,
    top_k=None,
    function_to_apply="sigmoid",
    device=device,
)

We use the same tokenizer used for training to tokenize/encode the validation set.

In [None]:
# Encode the validation texts as fixed-length inputs of 255 tokens: longer
# texts are truncated, shorter ones padded. `padding="max_length"` is the
# supported spelling of the deprecated `pad_to_max_length=True` flag.
test_encodings = tokenizer.batch_encode_plus(
    dataset["validate"]["text"],
    max_length=255,
    padding="max_length",
    truncation=True,
    return_token_type_ids=True,
)

Once we've made the data loadable by putting it into a `DataLoader`, we can run the model over the validation set batch by batch and collect its predictions.

In [None]:
# Turn the encoded validation set into aligned tensors. The positional order
# (input ids, attention mask, labels, token type ids) is the order in which
# the evaluation loop unpacks each batch.
val_input_ids = torch.tensor(test_encodings['input_ids'])
val_attention_mask = torch.tensor(test_encodings['attention_mask'])
val_labels = torch.tensor(ds_enc["validate"]["labels"])
val_token_type_ids = torch.tensor(test_encodings['token_type_ids'])

test_data = torch.utils.data.TensorDataset(
    val_input_ids,
    val_attention_mask,
    val_labels,
    val_token_type_ids,
)

# Sequential sampling keeps the batches in dataset order.
test_dataloader = torch.utils.data.DataLoader(
    test_data,
    batch_size=BATCH_SIZE,
    sampler=torch.utils.data.SequentialSampler(test_data),
)

In [None]:
# Switch the model to evaluation mode before running inference.
model.eval()

logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

for batch in test_dataloader:
    # Unpack in the order the TensorDataset was built. The token type ids are
    # part of the dataset but are not fed to the model.
    b_input_ids, b_input_mask, b_labels, _ = batch

    with torch.no_grad():
        # Only the tensors the forward pass needs are moved to `device`; the
        # batch itself stays on the CPU so that the ids collected below do not
        # pile up in GPU memory over the whole run.
        outs = model(b_input_ids.to(device), attention_mask=b_input_mask.to(device))
        b_logit_pred = outs[0]
        pred_label = torch.sigmoid(b_logit_pred)

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred.cpu().numpy())
    true_labels.append(b_labels.numpy())
    pred_labels.append(pred_label.cpu().numpy())

# Flatten the per-batch outputs into one entry per sample.
# (`logit_preds` is left as a list of per-batch arrays, as before.)
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Convert to boolean indicator rows: a label is predicted when its sigmoid
# score exceeds 0.50.
true_bools = [tl == 1 for tl in true_labels]
pred_bools = [pl > 0.50 for pl in pred_labels]

We create a classification report:

In [None]:
# The printed labels name what is actually computed: a micro-averaged F1
# score, and `accuracy_score`, which on multi-label indicator input is the
# exact-match (subset) accuracy rather than a per-label accuracy.
print('Test micro-F1: ', f1_score(true_bools, pred_bools, average='micro'))
print('Test subset accuracy: ', accuracy_score(true_bools, pred_bools), '\n')

# Per-class precision / recall / F1, with rows named after CLASS_NAMES.
clf_report = classification_report(true_bools, pred_bools, target_names=CLASS_NAMES)
print(clf_report)

Finally, we render a 'head to head' comparison table that maps each text prediction to actual and predicted labels.

In [None]:
# Lookup table from class index to class name, in CLASS_NAMES order.
idx2label = dict(enumerate(CLASS_NAMES))

In [None]:
# For each sample, list the positions of the labels that are switched on.
true_label_idxs = [np.where(row)[0].flatten().tolist() for row in true_bools]
pred_label_idxs = [np.where(row)[0].flatten().tolist() for row in pred_bools]

In [None]:
# Translate label indices into class names. A sample with no active label
# keeps its (empty) index list as-is.
true_label_texts = [
    [idx2label[i] for i in idxs] if idxs else idxs
    for idxs in true_label_idxs
]
pred_label_texts = [
    [idx2label[i] for i in idxs] if idxs else idxs
    for idxs in pred_label_idxs
]

In [None]:
# Decode each sample's token ids back into text, leaving out special tokens.
symptom_texts = []
for token_ids in tokenized_texts:
    decoded = tokenizer.decode(
        token_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    symptom_texts.append(decoded)

In [None]:
# One row per sample: decoded text next to its true and predicted label names.
comparison_columns = {
    'symptom_text': symptom_texts,
    'true_labels': true_label_texts,
    'pred_labels': pred_label_texts,
}
comparisons_df = pd.DataFrame(comparison_columns)

# Save the table to disk, then echo it as the cell output.
comparisons_df.to_csv('comparisons.csv')
comparisons_df

### Shapley analysis

In [None]:
# Build a SHAP explainer around the text-classification pipeline;
# `output_names` labels the explained outputs with CLASS_NAMES.
explainer = shap.Explainer(classifier, output_names=CLASS_NAMES)

In [None]:
# Explain one validation report: the slice 1:2 selects only the item at index 1.
sample_texts = dataset["validate"]["text"][1:2]
shap_values = explainer(sample_texts)

In [None]:
# Render the computed SHAP values with SHAP's text plot.
shap.plots.text(shap_values)