Spaces:

keras-io
/

SpeakerRecognition

Runtime error

App Files Files Community

SpeakerRecognition / app.py

RobotJelly

app.py

dcdf570 over 2 years ago

raw

history blame contribute delete

3.96 kB

	#import os
	#import shutil
	import numpy as np

	import tensorflow as tf
	from tensorflow import keras

	#from pathlib import Path
	#from IPython.display import display, Audio

	import numpy as np
	import tensorflow as tf
	import gradio as gr
	from huggingface_hub import from_pretrained_keras
	#import cv2
	#from IPython.display import Audio

	# Speaker labels; `predict` indexes this list with the arg-max of the
	# model output, so the order must match the model's output order.
	classes_names = [
	    'Benjamin_Netanyau',
	    'Jens_Stoltenberg',
	    'Julia_Gillard',
	    'Magaret_Tarcher',
	    'Nelson_Mandela',
	]

	# Sampling rate shared by all of the audio samples. Every sample is one
	# second long, so this is also the number of samples requested when a
	# wave file is decoded.
	SAMPLING_RATE = 16000

	# NOTE: the dataset-only settings (VALID_SPLIT, SHUFFLE_SEED, SCALE) and
	# the noisy test-dataset pipeline are not used by this app and are
	# therefore not defined here.

	# Pre-trained speaker-recognition model, fetched through huggingface_hub.
	model = from_pretrained_keras("keras-io/speaker-recognition")


	def path_to_audio(path):
	    """Read the wave file at `path` and return its decoded audio tensor.

	    The file is decoded with one channel and `SAMPLING_RATE` samples
	    requested; the sample rate reported by the decoder is discarded.
	    """
	    raw_bytes = tf.io.read_file(path)
	    waveform, _sample_rate = tf.audio.decode_wav(
	        raw_bytes, desired_channels=1, desired_samples=SAMPLING_RATE
	    )
	    return waveform

	def audio_to_fft(audio):
	    """Return the magnitude of the first half of the FFT of `audio`.

	    tf.signal.fft transforms the innermost dimension, so the trailing
	    axis of `audio` is squeezed away before the transform and added
	    back afterwards.
	    """
	    samples = tf.squeeze(audio, axis=-1)
	    as_complex = tf.complex(real=samples, imag=tf.zeros_like(samples))
	    spectrum = tf.signal.fft(tf.cast(as_complex, tf.complex64))
	    spectrum = tf.expand_dims(spectrum, axis=-1)

	    # Only the first half of the bins along axis 1 is kept: these are
	    # the positive frequencies. The absolute value is returned.
	    half = samples.shape[1] // 2
	    return tf.math.abs(spectrum[:, :half, :])


	#actual_audio_path = '/content/drive/MyDrive/Downloads/16000_pcm_speeches/audio/Benjamin_Netanyau/260.wav'

	# print(path_to_audio(actual_audio_path).shape)
	# print(actual_audio_path.shape)

	def predict(actual_audio_path, actual_label):
	    """Predict which speaker is heard in an audio file.

	    Args:
	        actual_audio_path: Path of the wave file to classify. May be
	            empty or None, e.g. when the live interface fires before an
	            audio file has been provided.
	        actual_label: Speaker name supplied by the user. It is not used
	            for the prediction itself.

	    Returns:
	        A tuple ``(predicted_speaker, actual_audio_path)``. When no audio
	        path is given, ``("", None)`` is returned instead of raising.
	    """
	    # Without a file there is nothing to decode; return empty outputs
	    # rather than letting tf.io.read_file fail on a missing path.
	    if not actual_audio_path:
	        return "", None

	    waveform = path_to_audio(actual_audio_path)
	    # The FFT helper and the model work on batches, so add a batch axis.
	    batch = tf.expand_dims(waveform, axis=0)
	    # Get the signal FFT
	    ffts = audio_to_fft(batch)
	    # Predict and pick the highest-scoring class of the single batch item.
	    scores = model.predict(ffts)
	    predicted_index = int(np.argmax(scores, axis=-1)[0])
	    return classes_names[predicted_index], actual_audio_path

	# The app takes one audio file to be recognised plus the name of the
	# speaker the user expects. (Named `demo_inputs` so that the builtin
	# `input` is not shadowed.)
	demo_inputs = [
	    gr.inputs.Audio(source="upload", type="filepath", label="Take audio sample"),
	    gr.inputs.Textbox(label="Actual Speaker"),
	]

	# The app outputs the predicted speaker name and the audio file that was
	# passed in, as returned by `predict`.
	demo_outputs = [
	    gr.outputs.Textbox(label="Predicted Speaker"),
	    gr.outputs.Audio(label="Corresponding Audio"),
	]

	# It's good practice to pass examples, a description and a title to guide
	# users. Each example is [audio path, actual speaker].
	examples = [
	    ['audios/260.wav', 'Benjamin_Netanyau'],
	    ['audios/611.wav', 'Jens_Stoltenberg'],
	    ['audios/65.wav', 'Julia_Gillard'],
	    ['audios/1083.wav', 'Magaret_Tarcher'],
	    ['audios/605.wav', 'Nelson_Mandela'],
	]
	title = "Speaker Recognition"
	description = "Select the noisy audio samples from examples to check whether the speaker recognised by the model is correct or not even in presence of noise !!!"
	article = "<center>Space By: <u><a href='https://github.com/robotjellyzone'><b>Kavya Bisht</b></a></u> \n Based on <a href='https://keras.io/examples/audio/speaker_recognition_using_cnn/'><b>this notebook</b></a></center>"

	# Build the interface around `predict` and start serving it.
	gr.Interface(
	    fn=predict,
	    inputs=demo_inputs,
	    outputs=demo_outputs,
	    examples=examples,
	    live=True,
	    allow_flagging=False,
	    analytics_enabled=False,
	    title=title,
	    description=description,
	    article=article,
	).launch(enable_queue=True, debug=True)