## Gathering NER Dataset

In [None]:
from datasets import DatasetDict
from transformers import AutoTokenizer

dataset = DatasetDict.load_from_disk().remove_columns(["token_type_ids", "attention_mask"])

tokenizer = AutoTokenizer.from_pretrained("./../tokenizer")
tokenizer.pad_token_id = 0
tokenizer.pad_token = "<|padding|>"
tokenizer.padding_size = "right"

# new tokens for prompting
num_new_tokens = tokenizer.add_tokens(["<|startofprompt|>", "<|sepofprompt|>", "<|endofprompt|>"])
# new tokens for entities
tokenizer.add_tokens(["<|entity:PER|>", "<|entity:LOC|>", "<|entity:ORG|>", "<|entity|>", "<|detectentities|>"])
# new tokens for images
tokenizer.add_tokens(["<|startofimage|>", "<|endofimage|>"])
tokenizer.add_tokens([ f"<|image:{tkn}|>" for tkn in range(16000)])

tokenizer.save_pretrained("./tokenizer")

print("Total Vocab Size:", len(tokenizer))

In [None]:
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

In [None]:
import numpy as np
from tqdm import tqdm
import string
import os
import re

audio_paths = sorted(os.listdir("./mp3"))
txt_paths = sorted(os.listdir("./txt"))
data = np.load("tokens.npz")
audio_tokens = [data[key] for key in data.keys()]

In [None]:
def tag_entities(text):
 
 patterns = {
 "PER": r'\|(.*?)\]',
 "LOC": r'\$(.*?)\]',
 "ORG": r'\{(.*?)\]'
 }
 
 entities = []

 for entity, pattern in patterns.items():
 matches = re.findall(pattern, text)
 text = re.sub(pattern, lambda m: f'<|entity:{entity}|>{m.group(1)}<|entity|>', text)
 entities += matches

 return text, entities

data = []

for idx in tqdm(range(len(txt_paths))):
 
 with open(os.path.join("./txt", txt_paths[idx])) as f:
 txt = f.read()
 
 text, entities = tag_entities(txt.lower())
 
 audio_token = audio_tokens[idx]
 
 prompt = "".join([f"<|audio:{tkn}|>" for tkn in audio_token]) + "<|detectentities|><|startofprompt|><|endofprompt|>" + "<|startoftranscript|>" + text + "<|endoftranscript|>"
 
 try:
 outputs = tokenizer(prompt, truncation=True, padding="max_length", max_length=2048)
 data.append({
 "audio_tokens": audio_token,
 "raw_text": text,
 "transcript": txt.translate(str.maketrans('', '', string.punctuation)).lower(),
 "entities": entities,
 "prompt": prompt,
 "input_ids": outputs["input_ids"],
 "attention_mask": output["attention_mask"]
 })
 except:
 print(idx)
 continue
 
from datasets import Dataset
import pandas as pd

ds = Dataset.from_pandas(pd.DataFrame(data))

ds.save_to_disk("entity_tokenized")
ds.push_to_hub("darshanmakwana/entity_tokenized")

## Validating Model

In [11]:
from transformers import GPT2LMHeadModel, AutoTokenizer
from datasets import Dataset
import torch

dataset_name = "entity_tokenized"
tokenizer_path = "./../tokenizer"
max_length = 2048
device = "cuda:0"
dtype = torch.float16

dataset = Dataset.load_from_disk(dataset_name)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenizer.pad_token_id = 0
tokenizer.pad_token = "<|padding|>"
tokenizer.padding_side = "left"

# new tokens for prompting
num_new_tokens = tokenizer.add_tokens(["<|startofprompt|>", "<|sepofprompt|>", "<|endofprompt|>"])
# new tokens for entities
tokenizer.add_tokens(["<|entity:PER|>", "<|entity:LOC|>", "<|entity:ORG|>", "<|entity|>", "<|detectentities|>"])

model = GPT2LMHeadModel.from_pretrained("./out/checkpoint-20000").to(device).to(dtype).eval()

In [21]:
sum([param.numel() for param in model.parameters()]) / (1024 * 1024)

114.073974609375

In [12]:
from eval_model import process
from math import ceil
from tqdm import tqdm
import re

def extract_entities(text):
 
 patterns = {
 "PER": r'<\|entity:PER\|>(.*?)<\|entity\|>',
 "LOC": r'<\|entity:LOC\|>(.*?)<\|entity\|>',
 "ORG": r'<\|entity:ORG\|>(.*?)<\|entity\|>'
 }
 
 entities = []

 for entity, pattern in patterns.items():
 matches = re.findall(pattern, text)
 text = re.sub(pattern, lambda m: f'{m.group(1)}', text)
 entities += [process(match) for match in matches]

 return text, entities

def preprocess(sample):
 prompt = "".join([f"<|audio:{tkn}|>" for tkn in sample["audio_tokens"]]) + "<|detectentities|><|startofprompt|><|endofprompt|>" + "<|startoftranscript|>"
 return {"prompt": prompt}

dataset = dataset.map(preprocess)
dataset = dataset.select(list(range(0, 1000)))

eot_token = tokenizer.encode("<|endoftranscript|>")[0]

batch_size = 128
texts = []
tp = 0
fp = 0
tn = 0

for idx in tqdm(range(ceil(len(dataset)/batch_size))):

 input_ids = tokenizer(dataset[idx * batch_size: (idx + 1) * batch_size]["prompt"], return_tensors="pt", padding=True, truncation=True).input_ids.to(model.device)
 par = input_ids.shape[-1]

 generations = model.generate(
 input_ids,
 max_new_tokens=max_length,
 eos_token_id = eot_token
 )
 texts += tokenizer.batch_decode(generations[:, par:], skip_special_tokens=True)

# transcript, pred_entities = extract_entities(transcripts[0])
 
# entities = sample["entities"]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:27<00:00, 3.42s/it]


In [13]:
tp = 0
fp = 0
fn = 0

for idx in tqdm(range(len(dataset))):
 
 transcript, entities = extract_entities(texts[idx])

 for entity in entities:
 if entity in dataset[idx]["entities"]:
 tp += 1
 else:
 fp += 1
 for entity in dataset[idx]["entities"]:
 if entity not in entities:
 fn += 1
 
pre = tp / (tp + fp) * 100
recall = tp / (tp + fn) * 100
print("Precision:", pre)
print("Recall:", recall)
print("F1 Score:", 2 / ((1/pre) + (1/recall)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 241.04it/s]

Precision: 69.53846153846153
Recall: 69.32515337423312
F1 Score: 69.43164362519201





In [None]:
## Train Iter Precision Recall F1 Score
 16000 68.80 69.27 69.03
 17000 72.92 70.78 71.83
 18000 76.78 75.34 76.05
 19000 81.78 80.92 81.34
 20000 85.05 80.74 82.84

In [16]:
2 / ((1/81.78) + (1/80.92))

81.34772710510141