import gradio as gr
import numpy as np
import sounddevice as sd
from difflib import SequenceMatcher
from transformers import pipeline

# Load the fine-tuned Whisper ASR model for Urdu
asr_model = "Abdullah17/whisper-small-urdu"
asr_pipe = pipeline("automatic-speech-recognition", model=asr_model)

# The candidate command list is not shown in the original snippet;
# this placeholder must be replaced with the actual Urdu commands.
commands = ["..."]

# Capture audio from the microphone with sounddevice (useful for running the
# transcription locally; the Gradio demo below records in the browser instead)
def capture_audio(rec_duration=6, sample_rate=16000):
    audio_data = sd.rec(int(rec_duration * sample_rate), samplerate=sample_rate, channels=1)
    sd.wait()
    return audio_data.flatten()

# Find the command in command_list whose text is most similar to the transcript
def find_most_similar_command(statement, command_list):
    best_match = None
    reply = None  # index of the best-matching command
    highest_similarity = 0.0
    for i, command in enumerate(command_list):
        similarity = SequenceMatcher(None, statement, command).ratio()
        if similarity > highest_similarity:
            highest_similarity = similarity
            best_match = command
            reply = i
    return best_match, reply

# Transcribe the recorded audio and report the closest matching command
def transcribe_the_command(audio):
    sample_rate, audio_samples = audio
    # Convert to mono float32 in [-1, 1], as expected by the ASR pipeline
    audio_samples = audio_samples.astype(np.float32)
    if audio_samples.ndim > 1:
        audio_samples = audio_samples.mean(axis=1)
    if np.abs(audio_samples).max() > 1.0:
        audio_samples /= 32768.0
    transcript = asr_pipe({"sampling_rate": sample_rate, "raw": audio_samples})["text"]
    most_similar_command, reply = find_most_similar_command(transcript, commands)
    return f"Transcript: {transcript}\nMost Similar Command: {most_similar_command}"

iface = gr.Interface(
    fn=transcribe_the_command,
    # Record directly in the browser; on Gradio 3.x use source="microphone" instead
    inputs=gr.Audio(sources=["microphone"], type="numpy", label="Recorded Audio"),
    outputs="text",
    title="Whisper Small Urdu Command",
    description=(
        "Realtime demo for Urdu speech recognition using a fine-tuned Whisper small model, "
        "outputting the closest matching command based on the speech transcript."
    ),
)

iface.launch()