from transformers import pipeline
import gradio as gr
import torch
import numpy as np
import librosa
import matplotlib.pyplot as plt
import noisereduce
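
# Load the HuBERT-based checkpoint that classifies Arabic letter + haraka utterances.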
model_id = "fydhfzh/hubert-classifier-aug-fold-3"
pipe = pipeline("audio-classification", model=model_id)

def get_binary_values():
    """Build 7-bit label strings: 5 bits for the letter index, 2 bits for the haraka."""
    binary_values = []
    for letter in range(1, 29):      # 28 Arabic letters
        binary_rep = format(letter, '05b')
        for haraka in range(1, 4):   # 3 short vowels per letter
            binary_harakat = format(haraka, '02b')
            binary_values.append(binary_rep + binary_harakat)
    return binary_values

binary_values = get_binary_values()
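
# Each of the 28 Arabic letters appears with its three short vowels
# (fatha, kasra, damma), in the same order as the generated binary codes.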
arabic_letters = [
"ุฃูŽ", "ุฅู", "ุฃู",
"ุจูŽ", "ุจู", "ุจู",
"ุชูŽ", "ุชู", "ุชู",
"ุซูŽ", "ุซู", "ุซู",
"ุฌูŽ", "ุฌู", "ุฌู",
"ุญูŽ", "ุญู", "ุญู",
"ุฎูŽ", "ุฎู", "ุฎู",
"ุฏูŽ", "ุฏู", "ุฏู",
"ุฐูŽ", "ุฐู", "ุฐู",
"ุฑูŽ", "ุฑู", "ุฑู",
"ุฒูŽ", "ุฒู", "ุฒู",
"ุณูŽ", "ุณู", "ุณู",
"ุดูŽ", "ุดู", "ุดู",
"ุตูŽ", "ุตู", "ุตู",
"ุถูŽ", "ุถู", "ุถู",
"ุทูŽ", "ุทู", "ุทู",
"ุธูŽ", "ุธู", "ุธู",
"ุนูŽ", "ุนู", "ุนู",
"ุบูŽ", "ุบู", "ุบู",
"ููŽ", "ูู", "ูู",
"ู‚ูŽ", "ู‚ู", "ู‚ู",
"ูƒูŽ", "ูƒู", "ูƒู",
"ู„ูŽ", "ู„ู", "ู„ู",
"ู…ูŽ", "ู…ู", "ู…ู",
"ู†ูŽ", "ู†ู", "ู†ู",
"ู‡ูŽ", "ู‡ู", "ู‡ู",
"ูˆูŽ", "ูˆู", "ูˆู",
"ูŠูŽ", "ูŠู", "ูŠู"
]
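
# Map each 7-bit label string emitted by the model to its Arabic character.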
arabic_representation = dict(zip(binary_values, arabic_letters))

def split_input(raw_input):
    """Split a recording into per-utterance segments, each padded/trimmed to 1 s at 16 kHz."""
    # Frame-wise energy (squared RMS), in dB relative to the quietest frame
    mse = librosa.feature.rms(y=raw_input, frame_length=2048, hop_length=512) ** 2
    mse_db = librosa.core.power_to_db(mse.squeeze(), ref=np.min, top_db=None)
    mse_db = mse_db[mse_db != 0]  # drop the 0 dB reference frames
    # Take a low percentile of the energy distribution as the silence threshold
    percentile_param = 10
    extra_db_param = 0
    threshold = np.percentile(mse_db, percentile_param) + extra_db_param
    print(f"Split threshold: {threshold:.2f} dB")
    # Frames more than `threshold` dB below the peak are treated as silence
    intervals = librosa.effects.split(y=raw_input, top_db=threshold)
    segments = []
    for start, end in intervals:
        # Extend each interval on both sides so utterance onsets/offsets are not clipped
        overlap = 2000
        start = max(start - overlap, 0)
        end = min(end + overlap, len(raw_input))
        split_audio = raw_input[start:end]
        if len(split_audio) < 16000:
            # Zero-pad symmetrically to exactly 16000 samples; for an odd
            # deficit the extra sample goes on the right
            deficit = 16000 - len(split_audio)
            pad_width = (deficit // 2, deficit - deficit // 2)
            split_audio = np.pad(split_audio, pad_width=pad_width, mode='constant', constant_values=(0, 0))
        else:
            split_audio = split_audio[:16000]
        segments.append(split_audio)
    return segments

def process_audio(filepath):
    """Load, denoise, and normalize a recording, then split it into segments."""
    audio, sr = librosa.load(filepath, sr=16000)  # resample to the model's 16 kHz rate
    audio = noisereduce.reduce_noise(y=audio, sr=sr)
    audio = librosa.util.normalize(audio)
    return split_input(audio)

def classify_utterances(filepath):
    """Classify each segment and return the predicted Arabic characters as one string."""
    audios = process_audio(filepath)
    # Segments are already at the model's 16 kHz rate, so raw arrays can be fed
    # to the pipeline; keep only the top prediction per segment
    output = [pipe(audio)[0] for audio in audios]
    predictions = [arabic_representation[x['label']] for x in output]
    return ' '.join(predictions)
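
# A minimal local sanity check (hypothetical file path, not part of the Gradio app):
#   print(classify_utterances("example_recording.wav"))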

demo = gr.Blocks()

mic_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['microphone'], type='filepath'),
    outputs=gr.Textbox()
)

file_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['upload'], type='filepath'),
    outputs=gr.Textbox()
)

# Expose both interfaces as tabs in a single Blocks app
with demo:
    gr.TabbedInterface(
        [mic_classification, file_classification],
        ['Classify Microphone', 'Classify Audio File']
    )

demo.launch()