from transformers import pipeline
import gradio as gr
import torch
import numpy as np
import librosa
import matplotlib.pyplot as plt
import noisereduce

model_id = "fydhfzh/hubert-classifier-aug-fold-3"
pipe = pipeline("audio-classification", model=model_id)


def get_binary_values():
    """Build the label strings: 5 bits for the letter (1-28) + 2 bits for the harakah (1-3)."""
    binary_values = []
    for letter in range(1, 29):
        letter_bits = format(letter, '05b')
        for harakah in range(1, 4):
            harakah_bits = format(harakah, '02b')
            binary_values.append(letter_bits + harakah_bits)
    return binary_values


binary_values = get_binary_values()

arabic_letters = [
    "أَ", "إِ", "أُ", "بَ", "بِ", "بُ", "تَ", "تِ", "تُ", "ثَ", "ثِ", "ثُ",
    "جَ", "جِ", "جُ", "حَ", "حِ", "حُ", "خَ", "خِ", "خُ", "دَ", "دِ", "دُ",
    "ذَ", "ذِ", "ذُ", "رَ", "رِ", "رُ", "زَ", "زِ", "زُ", "سَ", "سِ", "سُ",
    "شَ", "شِ", "شُ", "صَ", "صِ", "صُ", "ضَ", "ضِ", "ضُ", "طَ", "طِ", "طُ",
    "ظَ", "ظِ", "ظُ", "عَ", "عِ", "عُ", "غَ", "غِ", "غُ", "فَ", "فِ", "فُ",
    "قَ", "قِ", "قُ", "كَ", "كِ", "كُ", "لَ", "لِ", "لُ", "مَ", "مِ", "مُ",
    "نَ", "نِ", "نُ", "هَ", "هِ", "هُ", "وَ", "وِ", "وُ", "يَ", "يِ", "يُ",
]

# Map each binary label emitted by the classifier to its Arabic letter + harakah.
arabic_representation = dict(zip(binary_values, arabic_letters))


def split_input(raw_input):
    """Split a recording into per-utterance segments of exactly 16000 samples (1 s at 16 kHz)."""
    # Frame-wise energy in dB, used to derive a silence threshold for splitting.
    mse = librosa.feature.rms(y=raw_input, frame_length=2048, hop_length=512) ** 2
    mse_db = librosa.core.power_to_db(mse.squeeze(), ref=np.min, top_db=None)
    mse_db = mse_db[mse_db != 0]

    percentile_param = 10
    extra_db_param = 0
    threshold = np.percentile(mse_db, percentile_param) + extra_db_param
    print(threshold)

    intervals = librosa.effects.split(y=raw_input, top_db=threshold)  # top_db = 60 - threshold

    segments = []
    for start, end in intervals:
        # Add overlapping samples on both the lead and the trail to ensure a clean split.
        overlap = 2000
        start = max(start - overlap, 0)
        end = min(end + overlap, len(raw_input))
        split_audio = raw_input[start:end]

        if len(split_audio) < 16000:
            # Zero-pad evenly on both sides up to 16000 samples
            # (the right side takes the extra sample when the deficit is odd).
            deficit = 16000 - len(split_audio)
            pad_width = (deficit // 2, deficit - deficit // 2)
            split_audio = np.pad(split_audio, pad_width=pad_width,
                                 mode='constant', constant_values=(0, 0))
        else:
            split_audio = split_audio[:16000]

        segments.append(split_audio)

    return segments


def process_audio(filepath):
    """Load, denoise, normalize, and split the recording into fixed-length segments."""
    audio, sr = librosa.load(filepath, sr=16000)
    audio = noisereduce.reduce_noise(y=audio, sr=sr)
    audio = librosa.util.normalize(audio)
    return split_input(audio)


def classify_utterances(filepath):
    """Classify each segment and map the predicted binary labels to Arabic letters."""
    audios = process_audio(filepath)
    output = [pipe(audio)[0] for audio in audios]
    predictions = [arabic_representation[x['label']] for x in output]
    return ' '.join(predictions)


demo = gr.Blocks()

mic_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['microphone'], type='filepath'),
    outputs=gr.Textbox(),
)

file_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources=['upload'], type='filepath'),
    outputs=gr.Textbox(),
)

with demo:
    gr.TabbedInterface(
        [mic_classification, file_classification],
        ['Classify Microphone', 'Classify Audio File'],
    )

demo.launch()