# Importing the required libraries
import streamlit as st
import numpy as np
import librosa
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from st_audiorec import st_audiorec

# Define the target speakers in a dictionary.
# NOTE: THESE FILES WERE USED NEITHER TO TRAIN NOR TO TEST THE MODEL.
# I PULLED THEM OUT JUST SO WE CAN SEE HOW WELL THE MODEL PERFORMS ON
# UNSEEN AUDIO FILES.
#
# There are four more files for each speaker that haven't been added
# here yet, but I already tested them in the notebook: 98% accuracy.
target_dictionary = {
    0: ["p225", 'AUDIO_FILES/p225_358.wav'],  # label: [speaker_id, wav file]
    1: ["p226", 'AUDIO_FILES/p226_366.wav'],
    2: ["p228", 'AUDIO_FILES/p228_367.wav'],
    3: ["p236", 'AUDIO_FILES/p236_500.wav'],
    4: ["p237", 'AUDIO_FILES/p237_348.wav'],
    5: ["p241", 'AUDIO_FILES/p241_370.wav'],
    6: ["p249", 'AUDIO_FILES/p249_351.wav'],
    7: ["p257", 'AUDIO_FILES/p257_430.wav'],
    8: ["p304", 'AUDIO_FILES/p304_420.wav'],
    9: ["p326", 'AUDIO_FILES/p326_400.wav']
}

# Function to extract features from an audio file (same function as in the notebook)
def extract_feature(file_name):
    """
    Extract features from an audio file.

    Args:
        file_name (str): Path to the audio file.

    Returns:
        np.array: Feature vector (mel spectrogram averaged over time, 128 values).
    """
    X, sample_rate = librosa.load(file_name)  # load the audio file
    result = np.array([])  # array that stores the features
    # Compute the mel spectrogram and average it across time frames
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel))  # append the mel spectrogram to the result array
    return result  # return the feature vector

# Gender classification (NOT MY CODE)
###################################################
# Shout out: https://github.com/JoyBis48
# Link to Hugging Face Space: https://huggingface.co/spaces/Cosmos48/Gender-Voice-Recognition

# Function to convert audio to a spectrogram image, just so you can see it too.
def audio_to_spectrogram(file_path):
    y, sr = librosa.load(file_path)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, hop_length=512)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    plt.figure(figsize=(4, 4))
    plt.axis('off')
    plt.imshow(mel_spec_db, aspect='auto', origin='lower')
    plt.tight_layout()
    plt.savefig("spectrogram.png")
    plt.close()

# Function to classify gender
def classify_gender(file_path):
    features = extract_feature(file_path).reshape(1, -1)
    male_prob = gender_model.predict(features, verbose=0)[0][0]
    female_prob = 1 - male_prob
    gender = "male" if male_prob > female_prob else "female"
    probability = "{:.2f}".format(male_prob if gender == "male" else female_prob)
    return gender, probability

# Function to create the gender classification model
def create_model(vector_length=128):
    model = Sequential([
        Dense(256, input_shape=(vector_length,), activation='relu'),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
    return model

# Load the pre-trained gender model.
# saved_model.h5 contains the pretrained weights that were used.
gender_model = create_model()
gender_model.load_weights("NEW_MODELS/saved_model.h5")
####################################################
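# A quick sanity check one could run in the notebook (a hypothetical snippet,
# not part of the app): extract_feature returns 128 mel values by default,
# matching the vector_length=128 input expected by the gender model above.
#
#   vec = extract_feature('AUDIO_FILES/p225_358.wav')
#   print(vec.shape)  # expected: (128,)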
# Function to classify the speaker
def classify_speaker(file_path):
    # Extract features from the user recording, reshaped to match the model input
    features = extract_feature(file_path).reshape(1, -1)
    # Predict the probabilities for each of the 10 speakers
    speaker_probs = model.predict(features, verbose=0)[0]
    # The most likely speaker is the index with the highest probability
    most_likely_speaker = np.argmax(speaker_probs)
    probability = speaker_probs[most_likely_speaker]  # probability of the most likely speaker
    # Map the index to the speaker label
    speaker = f"Speaker {target_dictionary[most_likely_speaker][0]}"
    # Reference recording, so users can hear what the predicted voice sounds like
    wav_file = target_dictionary[most_likely_speaker][1]
    # Format the probability for better readability
    probability = "{:.2f}".format(probability)
    return speaker, probability, wav_file

# Load the speaker recognition model
model = load_model('NEW_MODELS/CUR_speaker_model.h5')

# Streamlit app
st.title("Voice Correlation Recognition")
st.write("This application is still undergoing fixes & updates ;-;")

# Option to upload a file
uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])

if uploaded_file is not None:
    with open("uploaded_audio.wav", "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.audio(uploaded_file, format='audio/wav')

    if st.button("Submit"):
        try:
            audio_to_spectrogram("uploaded_audio.wav")
            st.image("spectrogram.png", caption="Mel spectrogram of the uploaded audio file",
                     use_container_width=True)
            speaker, probability, _ = classify_speaker("uploaded_audio.wav")
            gender, gen_probability = classify_gender("uploaded_audio.wav")
            st.write(f"Predicted Gender: {gender}")             # gender of the speaker
            st.write(f"Gender Probability: {gen_probability}")  # confidence of the gender prediction
            st.write(f"Predicted Speaker: {speaker}")           # which speaker it is
            st.write(f"Speaker Probability: {probability}")     # confidence of the speaker prediction
        except Exception as e:
            st.error(f"Error occurred: {e}")

# Record audio with st_audiorec
recorded_audio = st_audiorec()

if recorded_audio:
    # Save the audio as a .wav file
    with open("recorded_audio.wav", "wb") as f:
        f.write(recorded_audio)
    st.write("Audio recorded and saved to recorded_audio.wav")  # show a status message
    st.audio("recorded_audio.wav")  # play back the recording

    # Process the recorded audio
    audio_to_spectrogram("recorded_audio.wav")
    st.image("spectrogram.png", caption="Mel spectrogram of the recorded audio file",
             use_container_width=True)

    # Classify the speaker and gender
    speaker, probability, wav_file = classify_speaker("recorded_audio.wav")
    gender, gen_probability = classify_gender("recorded_audio.wav")

    # Display the results
    st.write(f"Predicted Gender: {gender}")
    st.write(f"Gender Probability: {gen_probability}")
    st.write(f"Predicted Speaker: {speaker}")
    st.write(f"Speaker Probability: {probability}")

    # Play the reference wav file of the predicted speaker
    st.audio(wav_file)
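# To run the app locally (a minimal sketch; "app.py" is an assumed filename,
# substitute whatever this script is saved as; streamlit-audiorec is the
# package that provides the st_audiorec component used above):
#
#   pip install streamlit numpy librosa matplotlib tensorflow streamlit-audiorec
#   streamlit run app.py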