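"""Gradio Space: classify spoken Arabic letters with harakat.

The app records or uploads audio, denoises and normalizes it, splits it into
one-second utterances, classifies each with a fine-tuned HuBERT checkpoint,
and maps the model's binary labels back to Arabic letters with diacritics.
"""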
import gradio as gr
import librosa
import noisereduce
import numpy as np
from transformers import pipeline

# Load the fine-tuned HuBERT audio-classification checkpoint once at startup.
model_id = "fydhfzh/hubert-classifier-aug-fold-3"
pipe = pipeline("audio-classification", model=model_id)
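# A single call returns ranked class predictions. The label format below is
# assumed from the mapping built later in this file (5-bit letter index +
# 2-bit harakah), not verified against the checkpoint config:
#   pipe(waveform)  ->  [{'label': '0000101', 'score': 0.97}, ...]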
def get_binary_values():
    """Enumerate label strings: 5-bit letter index (1-28) + 2-bit harakah (1-3)."""
    binary_values = []
    for letter in range(1, 29):
        binary_rep = format(letter, '05b')
        for harakah in range(1, 4):
            binary_harakat = format(harakah, '02b')
            binary_values.append(binary_rep + binary_harakat)
    return binary_values

binary_values = get_binary_values()
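# Quick self-checks, derived directly from the loops above: 28 letters x 3
# harakat gives 84 labels, starting at letter 1 / harakah 1.
assert len(binary_values) == 84
assert binary_values[:3] == ['0000101', '0000110', '0000111']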
# 84 entries (28 letters x 3 harakat), in the same order as binary_values,
# so the two lists can be zipped into a lookup table.
arabic_letters = [
    "ุฃู", "ุฅู", "ุฃู",
    "ุจู", "ุจู", "ุจู",
    "ุชู", "ุชู", "ุชู",
    "ุซู", "ุซู", "ุซู",
    "ุฌู", "ุฌู", "ุฌู",
    "ุญู", "ุญู", "ุญู",
    "ุฎู", "ุฎู", "ุฎู",
    "ุฏู", "ุฏู", "ุฏู",
    "ุฐู", "ุฐู", "ุฐู",
    "ุฑู", "ุฑู", "ุฑู",
    "ุฒู", "ุฒู", "ุฒู",
    "ุณู", "ุณู", "ุณู",
    "ุดู", "ุดู", "ุดู",
    "ุตู", "ุตู", "ุตู",
    "ุถู", "ุถู", "ุถู",
    "ุทู", "ุทู", "ุทู",
    "ุธู", "ุธู", "ุธู",
    "ุนู", "ุนู", "ุนู",
    "ุบู", "ุบู", "ุบู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ู ู", "ู ู", "ู ู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู",
    "ูู", "ูู", "ูู"
]
arabic_representation = dict(zip(binary_values, arabic_letters))
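# e.g. arabic_representation['0000101'] == 'ุฃู' (letter 1 with its first harakah).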
def split_input(raw_input):
    """Split a recording into utterances, each padded/trimmed to 1 s at 16 kHz."""
    # Frame-wise energy (squared RMS) in dB relative to the quietest frame.
    mse = librosa.feature.rms(y=raw_input, frame_length=2048, hop_length=512) ** 2
    mse_db = librosa.power_to_db(mse.squeeze(), ref=np.min, top_db=None)
    mse_db = mse_db[mse_db != 0]

    # Adaptive silence threshold: the 10th percentile of frame energies.
    percentile_param = 10
    extra_db_param = 0
    threshold = np.percentile(mse_db, percentile_param) + extra_db_param
    print(f"silence threshold: {threshold:.1f} dB")

    intervals = librosa.effects.split(y=raw_input, top_db=threshold)

    segments = []
    # Extend each interval on both sides so onsets and offsets are not clipped.
    overlap = 2000
    for start, end in intervals:
        start = max(start - overlap, 0)
        end = min(end + overlap, len(raw_input))
        split_audio = raw_input[start:end]
        if len(split_audio) < 16000:
            # Zero-pad symmetrically to exactly 16000 samples; splitting the
            # deficit this way also handles odd pad lengths.
            deficit = 16000 - len(split_audio)
            pad_width = (deficit // 2, deficit - deficit // 2)
            split_audio = np.pad(split_audio, pad_width=pad_width, mode='constant')
        else:
            split_audio = split_audio[:16000]
        segments.append(split_audio)
    return segments
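# Sanity sketch (hypothetical input: two seconds of noise at 16 kHz); with the
# symmetric padding above, every returned segment is exactly one second long:
#   segments = split_input(np.random.randn(32000).astype(np.float32))
#   assert all(len(s) == 16000 for s in segments)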
def process_audio(filepath):
    """Load, denoise, normalize, and split a recording into 1 s segments."""
    # Resample to 16 kHz, the rate the classifier and split_input expect.
    audio, sr = librosa.load(filepath, sr=16000)
    audio = noisereduce.reduce_noise(y=audio, sr=sr)
    audio = librosa.util.normalize(audio)
    return split_input(audio)
def classify_utterances(filepath):
    audios = process_audio(filepath)
    # The pipeline accepts raw waveform arrays; [0] keeps the top-scoring class.
    output = [pipe(audio)[0] for audio in audios]
    predictions = [arabic_representation[x['label']] for x in output]
    return ' '.join(predictions)
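# Usage sketch (hypothetical path): classify_utterances("recording.wav")
# returns one predicted letter per detected utterance, joined by spaces.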
demo = gr.Blocks()

mic_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['microphone'], type='filepath'),
    outputs=gr.Textbox(),
)

file_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['upload'], type='filepath'),
    outputs=gr.Textbox(),
)

with demo:
    # Expose both interfaces as tabs inside the Blocks app.
    gr.TabbedInterface(
        [mic_classification, file_classification],
        ['Classify Microphone', 'Classify Audio File'],
    )

demo.launch()
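# Run locally with `python app.py` (assumed dependencies: gradio, transformers,
# torch, librosa, noisereduce, numpy), then open the printed local URL.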