## Model Evaluation

Hi there, welcome to my notebook! 👋

This notebook is all about evaluating different ASR models using a small subset of a larger dataset.

This notebook is self-contained, meaning that, except for installing the necessary libraries, you can run all cells in order and everything should work.
If not, feel free to leave me a message and I'll do my best to fix the issue.

All you need for this notebook to work is a **HuggingFace token**.

If you don't know how to find it, go to your Hugging Face account:

> Profile -> Settings -> Access Tokens -> + Create new token

You can find the Notebook in Google Colab [here](https://colab.research.google.com/drive/1awfo4_Llrg-aypEc_MdJXcqQMj3r_Fy2?usp=share_link)

### 1. Import all necessary libraries

In [None]:
import os
from getpass import getpass

from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from huggingface_hub import login
from datasets import load_dataset
from datasets import Audio
from tqdm import tqdm
import evaluate
import torch

### 2. Log in & set constants

In [None]:
# Login
# Never paste the access token into the notebook itself: read it from the
# HF_TOKEN environment variable and, if that is not set, ask for it with a
# hidden prompt so the secret is not stored in the saved/shared notebook.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# Set constants
N_SAMPLES = 100  # number of dataset samples every model is evaluated on

### 3. Load Dataset & Metric

In [None]:
# Load the Dataset
# Stream the "clean" test split instead of downloading it, resample the audio
# column to 16 kHz (the rate passed to every processor below) and keep only
# the first N_SAMPLES examples.
dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True, token=True, trust_remote_code=True)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset = dataset.take(N_SAMPLES)

# Load the Evaluation Metric
wer_metric = evaluate.load("wer")

# Create Dictionary to Store Results
# Each entry maps a model checkpoint to its WER in percent. Entries start at
# +inf rather than 0 so that a model whose evaluation cell was skipped or
# failed can never be picked as the one with the lowest WER.
results = {
    "facebook/wav2vec2-base-960h": float("inf"),
    "openai/whisper-tiny.en": float("inf"),
    "facebook/s2t-medium-librispeech-asr": float("inf"),
}

### 4. Evaluate the first Model

In [None]:
# Load the 1. ASR Model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")


# Run Inference For the First Model
predictions = []
references = []

# Inference only: disable autograd so that no computation graph is built and
# kept alive for each forward pass.
with torch.no_grad():
    for item in tqdm(dataset, total=N_SAMPLES):
        input_values = processor(item["audio"]["array"], sampling_rate=16000, return_tensors="pt", padding="longest").input_values  # Batch size 1
        logits = model(input_values).logits
        # Take the highest-scoring token id at every position, then decode to text.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)
        predictions.append(transcription[0])
        references.append(item["text"])


# Word error rate over all collected samples, stored as a percentage.
wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)
print("WER:", wer)
results["facebook/wav2vec2-base-960h"] = wer

### 5. Evaluate the second Model


In [None]:
# Load the 2. ASR Model
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")


# Run Inference For the Second Model
predictions = []
references = []

for item in tqdm(dataset, total=N_SAMPLES):
    # No `padding` override here: the Whisper feature extractor's default pads
    # each clip to the fixed-length window the encoder is built for, whereas
    # `padding="longest"` on a single clip would leave the features shorter.
    input_features = processor(item["audio"]["array"], sampling_rate=16000, return_tensors="pt").input_features  # Batch size 1
    predicted_ids = model.generate(input_features=input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    # Prediction and reference go through the same tokenizer normalizer so
    # that both sides of the WER comparison are in the same text form.
    predictions.append(processor.tokenizer.normalize(transcription[0]))
    references.append(processor.tokenizer.normalize(item["text"]))


# Word error rate over all collected samples, stored as a percentage.
wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)
print("WER:", wer)
results["openai/whisper-tiny.en"] = wer

### 6. Evaluate the third Model

In [None]:
# Load the 3. ASR Model
checkpoint = "facebook/s2t-medium-librispeech-asr"
model = Speech2TextForConditionalGeneration.from_pretrained(checkpoint)
processor = Speech2TextProcessor.from_pretrained(checkpoint, do_upper_case=True)


# Run Inference For the Third Model
predictions, references = [], []

for example in tqdm(dataset, total=N_SAMPLES):
    # Turn the raw waveform into model inputs (features + attention mask).
    encoded = processor(
        example["audio"]["array"],
        sampling_rate=16000,
        padding=True,
        return_tensors="pt",
    )
    generated_ids = model.generate(
        input_features=encoded.input_features,
        attention_mask=encoded.attention_mask,
    )
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    predictions.append(decoded[0])
    references.append(example["text"])


# Word error rate over all collected samples, stored as a percentage.
wer = round(100 * wer_metric.compute(references=references, predictions=predictions), 2)
print("WER:", wer)
results[checkpoint] = wer

### 7. Find the winning Model

In [None]:
# The winner is the checkpoint with the smallest stored WER; on a tie the
# entry that appears first in `results` is kept.
winning_model, min_wer = min(results.items(), key=lambda entry: entry[1])

print(f"The model {winning_model} has the lowest WER Score achieved with WER: {min_wer}")