from transformers import pipeline
import gradio as gr
import numpy as np
import librosa
import noisereduce
model_id = "fydhfzh/hubert-classifier-aug-fold-3"
pipe = pipeline("audio-classification", model=model_id)
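# Label scheme (inferred from the mapping built below): each model label is a
# 7-bit binary string, a 5-bit letter index (1-28) followed by a 2-bit haraka
# index (1 = fatha, 2 = kasra, 3 = damma).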
def get_binary_values():
    """Rebuild the model's label set: a 5-bit letter index (1-28)
    concatenated with a 2-bit haraka index (1-3)."""
    binary_values = []
    for letter in range(1, 29):
        binary_rep = format(letter, '05b')
        for haraka in range(1, 4):
            binary_harakat = format(haraka, '02b')
            binary_values.append(binary_rep + binary_harakat)
    return binary_values
binary_values = get_binary_values()
arabic_letters = [
"ุฃูŽ", "ุฅู", "ุฃู",
"ุจูŽ", "ุจู", "ุจู",
"ุชูŽ", "ุชู", "ุชู",
"ุซูŽ", "ุซู", "ุซู",
"ุฌูŽ", "ุฌู", "ุฌู",
"ุญูŽ", "ุญู", "ุญู",
"ุฎูŽ", "ุฎู", "ุฎู",
"ุฏูŽ", "ุฏู", "ุฏู",
"ุฐูŽ", "ุฐู", "ุฐู",
"ุฑูŽ", "ุฑู", "ุฑู",
"ุฒูŽ", "ุฒู", "ุฒู",
"ุณูŽ", "ุณู", "ุณู",
"ุดูŽ", "ุดู", "ุดู",
"ุตูŽ", "ุตู", "ุตู",
"ุถูŽ", "ุถู", "ุถู",
"ุทูŽ", "ุทู", "ุทู",
"ุธูŽ", "ุธู", "ุธู",
"ุนูŽ", "ุนู", "ุนู",
"ุบูŽ", "ุบู", "ุบู",
"ููŽ", "ูู", "ูู",
"ู‚ูŽ", "ู‚ู", "ู‚ู",
"ูƒูŽ", "ูƒู", "ูƒู",
"ู„ูŽ", "ู„ู", "ู„ู",
"ู…ูŽ", "ู…ู", "ู…ู",
"ู†ูŽ", "ู†ู", "ู†ู",
"ู‡ูŽ", "ู‡ู", "ู‡ู",
"ูˆูŽ", "ูˆู", "ูˆู",
"ูŠูŽ", "ูŠู", "ูŠู"
]
arabic_representation = dict(zip(binary_values, arabic_letters))
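# Optional sanity check: 28 letters x 3 harakat should yield 84 entries on
# both sides; otherwise zip() would silently drop the unmatched tail.
assert len(binary_values) == len(arabic_letters), 'label/letter count mismatch'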
def split_input(raw_input):
    # Estimate a per-recording silence threshold from frame energies: the 10th
    # percentile of the RMS power, in dB above the quietest frame.
    mse = librosa.feature.rms(y=raw_input, frame_length=2048, hop_length=512) ** 2
    mse_db = librosa.power_to_db(mse.squeeze(), ref=np.min, top_db=None)
    mse_db = mse_db[mse_db != 0]
    percentile_param = 10
    extra_db_param = 0
    threshold = np.percentile(mse_db, percentile_param) + extra_db_param
    print(f'silence threshold (dB): {threshold}')
    # librosa.effects.split() marks anything more than top_db below the peak as
    # silence, so the percentile value is used directly as that margin.
    intervals = librosa.effects.split(y=raw_input, top_db=threshold)
    segments = []
    for start, end in intervals:
        # Extend each interval by a small lead and trail so the split does not
        # clip the utterance edges.
        overlap = 2000
        start = max(start - overlap, 0)
        end = min(end + overlap, len(raw_input))
        split_audio = raw_input[start:end]
        if len(split_audio) < 16000:
            # Zero-pad both sides to exactly 16000 samples; an odd deficit puts
            # the extra sample on the right.
            pad_left = (16000 - len(split_audio)) // 2
            pad_right = 16000 - len(split_audio) - pad_left
            split_audio = np.pad(split_audio, (pad_left, pad_right), mode='constant')
        else:
            split_audio = split_audio[:16000]
        segments.append(split_audio)
    return segments
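# Design note: fixed one-second windows match the classifier's expected input
# length; segments longer than 16000 samples are truncated rather than
# re-chunked.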
def process_audio(filepath):
    # Load at 16 kHz (the rate the HuBERT feature extractor expects), denoise,
    # peak-normalize, then split into one-second utterance chunks.
    audio, sr = librosa.load(filepath, sr=16000)
    audio = noisereduce.reduce_noise(y=audio, sr=sr)
    audio = librosa.util.normalize(audio)
    audios = split_input(audio)
    return audios
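# Quick local check (hypothetical path 'sample.wav'): every chunk returned by
# process_audio() should be exactly 16000 samples, i.e. one second at 16 kHz.
# chunks = process_audio('sample.wav')
# print(len(chunks), {len(c) for c in chunks})  # e.g. 3 {16000}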
def classify_utterances(filepath):
    audios = process_audio(filepath)
    # Keep the top-scoring prediction per chunk and map its binary label back
    # to the corresponding Arabic letter.
    output = [pipe(audio)[0] for audio in audios]
    predictions = [arabic_representation[x['label']] for x in output]
    return ' '.join(predictions)
demo = gr.Blocks()

mic_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['microphone'], type='filepath'),
    outputs=gr.Textbox(),
)

file_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['upload'], type='filepath'),
    outputs=gr.Textbox(),
)

with demo:
    gr.TabbedInterface(
        [mic_classification, file_classification],
        ['Classify Microphone', 'Classify Audio File'],
    )

demo.launch()
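# launch() serves the app locally (or inside the Space); launch(share=True) is
# the stock Gradio option for a temporary public link when testing elsewhere.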