# CS337_demo / app.py
# ThuyNT03's picture
# Upload 39 files
# 7aac284
import librosa
import joblib
from keras.models import load_model
import numpy as np
import pandas as pd
import gradio as gr
import h5py
# NOTE(review): this only binds a module-level Python variable; nothing in
# this file exports it to the process environment, so it presumably has no
# effect on TensorFlow's oneDNN option of the same name -- confirm intent.
TF_ENABLE_ONEDNN_OPTS = 0

# Directory prefix (including the trailing slash) to which model and scaler
# file names are appended.
root_path = "./model/"

# Integer class id -> emotion name used when rendering predictions.
num2label = dict(enumerate((
    "Neutral",
    "Calm",
    "Happy",
    "Sad",
    "Angry",
    "Fearful",
    "Disgust",
    "Surprised",
)))
def _load_classifier_family(prefix):
    """Load the four joblib-persisted classifiers of one model family.

    Files are read from ``root_path + "<prefix>_<feature set>.joblib"`` and
    returned as a tuple in (spectral, prosodic, full, mfcc) order.
    """
    return tuple(
        joblib.load(root_path + f"{prefix}_{feature_set}.joblib")
        for feature_set in ("spectral", "prosodic", "full", "mfcc")
    )


# One classifier per (model family, feature set) pair, loaded at import time.
SVM_spectral, SVM_prosodic, SVM_full, SVM_mfcc = _load_classifier_family("SVM")
NB_spectral, NB_prosodic, NB_full, NB_mfcc = _load_classifier_family("NB")
DT_spectral, DT_prosodic, DT_full, DT_mfcc = _load_classifier_family("DT")
MLP_spectral, MLP_prosodic, MLP_full, MLP_mfcc = _load_classifier_family("MLP")
RF_spectral, RF_prosodic, RF_full, RF_mfcc = _load_classifier_family("RF")
def load_model_from_h5(file_path):
    """Load a Keras model from an HDF5 file without compiling it.

    The file at ``file_path`` is opened read-only with h5py and the open
    handle is passed to ``load_model`` with ``compile=False``. The handle is
    closed when the ``with`` block exits.

    Returns:
        The model object produced by ``load_model``.
    """
    with h5py.File(file_path, mode='r') as h5_handle:
        return load_model(h5_handle, compile=False)
def _load_network_family(prefix):
    """Load the four ``.h5`` models of one network family.

    Files are read from ``root_path + "<prefix>_<feature set>.h5"`` via
    ``load_model_from_h5`` and returned as a tuple in
    (spectral, prosodic, full, mfcc) order.
    """
    return tuple(
        load_model_from_h5(root_path + f"{prefix}_{feature_set}.h5")
        for feature_set in ("spectral", "prosodic", "full", "mfcc")
    )


# One network per (family, feature set) pair, loaded at import time.
LSTM_spectral, LSTM_prosodic, LSTM_full, LSTM_mfcc = _load_network_family("LSTM")
LSTM_CNN_spectral, LSTM_CNN_prosodic, LSTM_CNN_full, LSTM_CNN_mfcc = _load_network_family("LSTM_CNN")
CNN_spectral, CNN_prosodic, CNN_full, CNN_mfcc = _load_network_family("CNN")
def _by_feature(mfcc, spectral, prosodic, full):
    """Bundle one family's four models into a dict keyed by feature-set name."""
    return {'mfcc': mfcc, 'spectral': spectral, 'prosodic': prosodic, 'full': full}


# Registry of every loaded model: total_model[<family>][<feature set>].
total_model = {
    "SVM": _by_feature(SVM_mfcc, SVM_spectral, SVM_prosodic, SVM_full),
    "NB": _by_feature(NB_mfcc, NB_spectral, NB_prosodic, NB_full),
    "DT": _by_feature(DT_mfcc, DT_spectral, DT_prosodic, DT_full),
    "MLP": _by_feature(MLP_mfcc, MLP_spectral, MLP_prosodic, MLP_full),
    "RF": _by_feature(RF_mfcc, RF_spectral, RF_prosodic, RF_full),
    "LSTM": _by_feature(LSTM_mfcc, LSTM_spectral, LSTM_prosodic, LSTM_full),
    "LSTM_CNN": _by_feature(LSTM_CNN_mfcc, LSTM_CNN_spectral, LSTM_CNN_prosodic, LSTM_CNN_full),
    "CNN": _by_feature(CNN_mfcc, CNN_spectral, CNN_prosodic, CNN_full),
}
def _load_scaler(feature_set):
    """Load the joblib-persisted scaler saved for one feature set."""
    return joblib.load(root_path + f"{feature_set}_features_standard_scaler.joblib")


# One fitted scaler per feature set, loaded at import time.
spectral_scaler = _load_scaler("spectral")
prosodic_scaler = _load_scaler("prosodic")
full_scaler = _load_scaler("full")
mfcc_scaler = _load_scaler("mfcc")

# Lookup used by Total_features: scaler[<feature set>].
scaler = dict(
    mfcc=mfcc_scaler,
    spectral=spectral_scaler,
    prosodic=prosodic_scaler,
    full=full_scaler,
)
def Load_audio(audio_path):
    """Read an audio file and return its samples.

    The file is read with ``librosa.load(audio_path, sr=48000)``. librosa
    also returns the sampling rate, which is discarded here; only the sample
    array is handed back for further audio processing.
    """
    samples, _sampling_rate = librosa.load(audio_path, sr=48000)
    return samples
def Spectral_extract_features(audio):  # audio is the signal of a single audio file
    """Build the spectral feature vector for one audio signal.

    Computes, in this order, 40 MFCCs, chroma (STFT), spectral contrast,
    tonnetz and a mel spectrogram with librosa, averages each result along
    axis 1, and concatenates the averages into one 1-D array.

    NOTE(review): no sampling rate is forwarded to librosa (the ``sr``
    argument was commented out in the original), so librosa's default
    applies -- confirm this matches how the models were trained.
    """
    per_frame_features = (
        librosa.feature.mfcc(y=audio, n_mfcc=40),
        librosa.feature.chroma_stft(y=audio),
        librosa.feature.spectral_contrast(y=audio),
        librosa.feature.tonnetz(y=audio),
        librosa.feature.melspectrogram(y=audio),
    )
    averaged = [matrix.mean(axis=1) for matrix in per_frame_features]
    return np.array(np.concatenate(averaged))
def mfcc_extract_features(audio):
    """Return the 40 MFCCs of ``audio`` averaged along axis 1.

    NOTE(review): no sampling rate is forwarded to librosa, so its default
    applies -- confirm this matches how the models were trained.
    """
    return librosa.feature.mfcc(y=audio, n_mfcc=40).mean(axis=1)
def Prosodic_extract_features(audio):
    """Build the prosodic feature vector for one audio signal.

    The vector is the concatenation of:
      * the pitch matrix from ``librosa.piptrack`` (``n_fft=128``,
        ``hop_length=512``) averaged along axis 1,
      * the clip duration reported by ``librosa.get_duration``,
      * the RMS energy from ``librosa.feature.rms`` averaged along axis 1.
    """
    pitches, _magnitudes = librosa.piptrack(y=audio, n_fft=128, hop_length=512)
    clip_duration = librosa.get_duration(y=audio)
    rms_energy = librosa.feature.rms(y=audio)

    # Wrap the scalar duration as a (1, 1) array so it can be reduced along
    # axis 1 exactly like the two matrices above.
    duration_row = np.array([clip_duration]).reshape(1, 1)

    pieces = [
        pitches.mean(axis=1),
        duration_row.mean(axis=1),
        rms_energy.mean(axis=1),
    ]
    return np.array(np.concatenate(pieces))
def Spectral_Prosodic(audio):
    """Return the 'full' feature vector: spectral features followed by prosodic ones."""
    return np.concatenate(
        (Spectral_extract_features(audio), Prosodic_extract_features(audio))
    )
def Total_features(audio, scaler):
    """Extract and scale all four feature sets for one audio signal.

    Args:
        audio: The audio samples, passed unchanged to the feature extractors.
        scaler: Mapping with the keys 'spectral', 'prosodic', 'full' and
            'mfcc'; each value must expose ``transform``.

    Returns:
        A dict with the same four keys. Each value is the result of calling
        the matching scaler's ``transform`` on the raw feature vector
        reshaped to a single row (``reshape(1, -1)``).
    """
    # Run the spectral and prosodic extractors once each. The 'full' vector
    # is the spectral vector followed by the prosodic one (the same layout
    # Spectral_Prosodic produces), so it is assembled from the vectors
    # already computed instead of running the librosa pipeline a second time.
    spectral = Spectral_extract_features(audio)
    prosodic = Prosodic_extract_features(audio)
    raw_vectors = {
        'spectral': spectral,
        'prosodic': prosodic,
        'full': np.concatenate((spectral, prosodic)),
        'mfcc': mfcc_extract_features(audio),
    }
    return {
        name: scaler[name].transform(vector.reshape(1, -1))
        for name, vector in raw_vectors.items()
    }
def total_predict(feature, total_model):  # feature is a dict bundling the 4 feature sets
    """Run every model on its matching feature set and collect emotion labels.

    Args:
        feature: Mapping with the keys 'mfcc', 'spectral', 'prosodic' and
            'full'; each value is the input passed to the models for that
            feature set.
        total_model: Nested mapping ``total_model[<model name>][<feature set>]``
            of objects exposing ``predict``.

    Returns:
        ``result[<feature set>][<model name>]`` -> label string taken from
        ``num2label``. A (feature set, model) pair whose prediction fails is
        reported on stdout and left out of the result, so one broken model
        does not prevent the others from being shown.
    """
    f_keys = ['mfcc', 'spectral', 'prosodic', 'full']
    m_keys = ['SVM', 'NB', 'DT', 'MLP', 'RF', 'LSTM', 'LSTM_CNN', 'CNN']
    # For these models predict() already yields the class id; the remaining
    # ones return a score matrix that is reduced with argmax.
    ML = {'SVM', 'NB', 'DT', 'MLP', 'RF'}

    result = {f: {} for f in f_keys}
    for f in f_keys:
        for m in m_keys:
            try:
                model = total_model[m][f]
                if m in ML:
                    class_id = model.predict(feature[f])[0]
                else:
                    temp = [np.array(feature[f]).reshape((1, -1))]
                    y_pred = model.predict(temp)
                    class_id = np.argmax(y_pred, axis=1)[0]
                result[f][m] = num2label[class_id]
            except Exception as err:
                # Best effort: keep going, but say what went wrong. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                print(f"Prediction failed for feature={f!r}, model={m!r}: {err!r}")
    return result
# def main_function(audio_path, scaler, total_model):
# audio = Load_audio(audio_path)
# feature = Total_features(audio, scaler)
# labels = total_predict(feature, total_model)
# table = pd.DataFrame.from_dict(labels).T
# return table
def main_function(audio_path, scaler, total_model):
    """Predict emotions for one audio file and return them as a table.

    Loads the audio, extracts and scales the four feature sets, runs every
    model, and arranges the labels in a DataFrame with one row per feature
    set and one column per model. A leading 'Đặc trưng' column carrying the
    feature-set names is inserted at position 0.
    """
    samples = Load_audio(audio_path)
    scaled_features = Total_features(samples, scaler)
    predictions = total_predict(scaled_features, total_model)

    # from_dict puts feature sets in columns; transpose so they become rows.
    table = pd.DataFrame.from_dict(predictions).transpose()
    table.insert(0, 'Đặc trưng', ['mfcc', 'spectral', 'prosodic', 'full'])
    return table
def main_interface(audio_file):
    """Gradio callback: turn the submitted audio into the prediction table.

    ``audio_file`` is forwarded as-is to ``main_function`` together with the
    module-level ``scaler`` and ``total_model`` registries, and the resulting
    table is returned.
    """
    return main_function(audio_file, scaler, total_model)
# Build the Gradio interface: an audio input delivered to the callback as a
# file path, and a dataframe output with one column per model.
_audio_input = gr.Audio(type='filepath')
_prediction_table = gr.Dataframe(
    headers=['Đặc trưng', 'SVM', 'NB', 'DT', 'MLP', 'RF', 'LSTM', 'LSTM_CNN', 'CNN'],
)
iface = gr.Interface(
    fn=main_interface,
    inputs=_audio_input,
    outputs=_prediction_table,
)

# Start serving the interface.
iface.launch()