from transformers import pipeline
import gradio as gr
import numpy as np
import librosa
import noisereduce

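# Audio-classification pipeline built on a fine-tuned HuBERT checkpoint.
# The label lookup further down assumes the model's class labels are the
# 7-character binary strings generated below (5 bits for the letter index,
# 2 bits for the harakah index).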
model_id = "fydhfzh/hubert-classifier-aug-fold-3"
pipe = pipeline("audio-classification", model=model_id)

def get_binary_values():
	"""Generate the 84 label strings: a 5-bit letter index followed by a 2-bit harakah index."""
	binary_values = []

	for letter in range(1, 29):  # 28 Arabic letters
		binary_rep = format(letter, '05b')
		for harakah in range(1, 4):  # three harakat per letter
			binary_harakat = format(harakah, '02b')
			binary_values.append(binary_rep + binary_harakat)

	return binary_values

binary_values = get_binary_values()

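# 28 Arabic letters, each in its three vocalised forms, ordered to match
# the binary labels generated above.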
arabic_letters = [
	"ุฃูŽ", "ุฅู", "ุฃู",
	"ุจูŽ", "ุจู", "ุจู",
	"ุชูŽ", "ุชู", "ุชู",
	"ุซูŽ", "ุซู", "ุซู",
	"ุฌูŽ", "ุฌู", "ุฌู",
	"ุญูŽ", "ุญู", "ุญู",
	"ุฎูŽ", "ุฎู", "ุฎู",
	"ุฏูŽ", "ุฏู", "ุฏู",
	"ุฐูŽ", "ุฐู", "ุฐู",
	"ุฑูŽ", "ุฑู", "ุฑู",
	"ุฒูŽ", "ุฒู", "ุฒู",
	"ุณูŽ", "ุณู", "ุณู",
	"ุดูŽ", "ุดู", "ุดู",
	"ุตูŽ", "ุตู", "ุตู",
	"ุถูŽ", "ุถู", "ุถู",
	"ุทูŽ", "ุทู", "ุทู",
	"ุธูŽ", "ุธู", "ุธู",
	"ุนูŽ", "ุนู", "ุนู",
	"ุบูŽ", "ุบู", "ุบู",
	"ููŽ", "ูู", "ูู",
	"ู‚ูŽ", "ู‚ู", "ู‚ู",
	"ูƒูŽ", "ูƒู", "ูƒู",
	"ู„ูŽ", "ู„ู", "ู„ู",
	"ู…ูŽ", "ู…ู", "ู…ู",
	"ู†ูŽ", "ู†ู", "ู†ู",
	"ู‡ูŽ", "ู‡ู", "ู‡ู",
	"ูˆูŽ", "ูˆู", "ูˆู",
	"ูŠูŽ", "ูŠู", "ูŠู"
]

# Map each binary label to its Arabic letter + harakah
arabic_representation = dict(zip(binary_values, arabic_letters))

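# Split a recording into per-utterance chunks using an energy threshold:
# frame-level RMS energy is converted to dB, a low percentile of that
# distribution becomes the top_db threshold for librosa.effects.split,
# and each detected interval is padded/truncated to one second (16,000 samples).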
def split_input(raw_input):
	# Frame-level energy (RMS squared) in dB, ignoring all-zero frames
	mse = librosa.feature.rms(y=raw_input, frame_length=2048, hop_length=512) ** 2
	mse_db = librosa.power_to_db(mse.squeeze(), ref=np.min, top_db=None)
	mse_db = mse_db[mse_db != 0]

	percentile_param = 10
	extra_db_param = 0

	# Use a low percentile of the energy distribution as the split threshold
	threshold = np.percentile(mse_db, percentile_param) + extra_db_param

	intervals = librosa.effects.split(y=raw_input, top_db=threshold)
	segments = []

	for start, end in intervals:
		# Extend each interval with a small lead/trail margin so the split
		# does not clip the onset or tail of the utterance
		overlap = 2000
		start = max(start - overlap, 0)
		end = min(end + overlap, len(raw_input))
		split_audio = raw_input[start:end]

		# Pad or truncate every segment to exactly one second (16,000 samples)
		if len(split_audio) < 16000:
			deficit = 16000 - len(split_audio)
			pad_left = deficit // 2
			pad_right = deficit - pad_left
			split_audio = np.pad(split_audio, pad_width=(pad_left, pad_right), mode='constant')
		else:
			split_audio = split_audio[:16000]

		segments.append(split_audio)

	return segments

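# Full preprocessing chain: load at 16 kHz, denoise, peak-normalise,
# then split into fixed-length utterance segments.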
def process_audio(filepath):
	audio, sr = librosa.load(filepath, sr=16000)
	audio = noisereduce.reduce_noise(y=audio, sr=sr)
	audio = librosa.util.normalize(audio)
	audios = split_input(audio)

	return audios
	

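# Gradio callback: run every detected segment through the classifier and
# decode the predicted binary labels back into Arabic letters.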
def classify_utterances(filepath):
	audios = process_audio(filepath)
	output = [pipe(audio)[0] for audio in audios]
	predictions = [arabic_representation[x['label']] for x in output]  

	return ' '.join(predictions)

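# Two-tab Gradio app: one tab records from the microphone, the other
# accepts an uploaded audio file; both call the same classifier.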
demo = gr.Blocks()

mic_classification = gr.Interface(
	fn=classify_utterances,
	inputs=gr.Audio(sources='microphone', type='filepath'),
	outputs=gr.Textbox()
)

file_classification = gr.Interface(
	fn=classify_utterances,
	inputs=gr.Audio(sources='upload', type='filepath'),
	outputs=gr.Textbox()
)

with demo:
	gr.TabbedInterface(
		[mic_classification, file_classification],
		['Classify Microphone', 'Classify Audio File']
	)

demo.launch()