from transformers import pipeline
import gradio as gr
import numpy as np
import librosa
import noisereduce

# HuBERT-based audio classifier; its labels are decoded into Arabic letters below
model_id = "fydhfzh/hubert-classifier-aug-fold-3"
pipe = pipeline("audio-classification", model=model_id)
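# Sanity-check sketch (assumption: the fine-tuned model's labels are the 7-bit
# strings built below, which is what the lookup in classify_utterances relies
# on): pipe.model.config.id2label should contain values such as '0000101'.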
def get_binary_values():
    # 28 letters x 3 harakat -> 84 labels such as '0000101'
    binary_values = []
    for letter in range(1, 29):
        binary_rep = format(letter, '05b')  # 5-bit letter index (1-28)
        for harakah in range(1, 4):
            binary_harakat = format(harakah, '02b')  # 2-bit harakah index (1-3)
            binary_values.append(binary_rep + binary_harakat)
    return binary_values

binary_values = get_binary_values()
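# Illustrative values: binary_values[:3] == ['0000101', '0000110', '0000111']
# (letter 1 = '00001' combined with the harakat codes '01', '10', '11')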
# The 28 Arabic letters, each with a harakah; the order within each row is
# assumed to be fatha, kasra, damma
arabic_letters = [
    "أَ", "إِ", "أُ",
    "بَ", "بِ", "بُ",
    "تَ", "تِ", "تُ",
    "ثَ", "ثِ", "ثُ",
    "جَ", "جِ", "جُ",
    "حَ", "حِ", "حُ",
    "خَ", "خِ", "خُ",
    "دَ", "دِ", "دُ",
    "ذَ", "ذِ", "ذُ",
    "رَ", "رِ", "رُ",
    "زَ", "زِ", "زُ",
    "سَ", "سِ", "سُ",
    "شَ", "شِ", "شُ",
    "صَ", "صِ", "صُ",
    "ضَ", "ضِ", "ضُ",
    "طَ", "طِ", "طُ",
    "ظَ", "ظِ", "ظُ",
    "عَ", "عِ", "عُ",
    "غَ", "غِ", "غُ",
    "فَ", "فِ", "فُ",
    "قَ", "قِ", "قُ",
    "كَ", "كِ", "كُ",
    "لَ", "لِ", "لُ",
    "مَ", "مِ", "مُ",
    "نَ", "نِ", "نُ",
    "هَ", "هِ", "هُ",
    "وَ", "وِ", "وُ",
    "يَ", "يِ", "يُ"
]
# Map model labels (e.g. '0000101') to Arabic letters with harakat
arabic_representation = dict(zip(binary_values, arabic_letters))
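# Example lookup (given the assumed harakah order above):
# arabic_representation['0000101'] -> 'أَ'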
def split_input(raw_input):
    # Frame-wise energy (squared RMS) in dB relative to the quietest frame
    mse = librosa.feature.rms(y=raw_input, frame_length=2048, hop_length=512) ** 2
    mse_db = librosa.power_to_db(mse.squeeze(), ref=np.min, top_db=None)
    mse_db = mse_db[mse_db != 0]  # drop the 0 dB reference frames
    # Pick the silence threshold at the 10th percentile of the energy range
    percentile_param = 10
    extra_db_param = 0
    threshold = np.percentile(mse_db, percentile_param) + extra_db_param
    print(f'split threshold (dB): {threshold}')
    # Treat frames more than `threshold` dB below the peak as silence
    intervals = librosa.effects.split(y=raw_input, top_db=threshold)
    segments = []
    for start, end in intervals:
        # Extend each interval on both sides to avoid clipping the utterance
        overlap = 2000
        start = max(start - overlap, 0)
        end = min(end + overlap, len(raw_input))
        segment = raw_input[start:end]
        if len(segment) < 16000:
            # Zero-pad both sides to exactly one second at 16 kHz
            # (floor/ceil keeps the total at 16000 when the deficit is odd)
            side_len = (16000 - len(segment)) / 2
            pad_width = (int(np.floor(side_len)), int(np.ceil(side_len)))
            segment = np.pad(segment, pad_width=pad_width, mode='constant', constant_values=(0, 0))
        else:
            segment = segment[:16000]  # truncate to one second
        segments.append(segment)
    return segments

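# Intended behaviour (illustrative): a recording with two utterances separated
# by silence should yield two segments of exactly 16000 samples each.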
def process_audio(filepath):
    # Load at 16 kHz (the rate the classifier expects), denoise, normalize
    audio, sr = librosa.load(filepath, sr=16000)
    audio = noisereduce.reduce_noise(y=audio, sr=sr)
    audio = librosa.util.normalize(audio)
    return split_input(audio)

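# Note: the audio-classification pipeline accepts raw numpy arrays and assumes
# they are sampled at the feature extractor's rate (16 kHz here), which
# process_audio guarantees via librosa.load(..., sr=16000).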
def classify_utterances(filepath):
    audios = process_audio(filepath)
    # pipe() returns labels sorted by score; [0] keeps the top prediction
    output = [pipe(audio)[0] for audio in audios]
    predictions = [arabic_representation[x['label']] for x in output]
    return ' '.join(predictions)

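# Hypothetical usage: classify_utterances('recording.wav') might return a
# string like 'بَ تِ' for a clip containing two utterances (filename and
# output shown for illustration only).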
demo = gr.Blocks()

mic_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources='microphone', type='filepath'),
    outputs=gr.Textbox()
)

file_classification = gr.Interface(
    fn=classify_utterances,
    inputs=gr.Audio(sources='upload', type='filepath'),
    outputs=gr.Textbox()
)

with demo:
    gr.TabbedInterface(
        [mic_classification, file_classification],
        ['Classify Microphone', 'Classify Audio File']
    )

demo.launch()
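# On Spaces, launch() serves the app directly; run locally, Gradio defaults to
# http://127.0.0.1:7860.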