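# Source: Hugging Face Space "whisperaudio", file app.py
# File-viewer residue kept for provenance: "Hunzla's picture", commit message
# "Update app.py", commit 83d70c9, links "raw" / "history blame", size 2.45 kB.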
from difflib import SequenceMatcher

import soundfile as sf
from transformers import pipeline
# Speech-recognition setup: the checkpoint id is kept in its own variable and
# the pipeline is built once at import time so every request reuses it.
asr_model = "Abdullah17/whisper-small-urdu"
asr_pipe = pipeline(task="automatic-speech-recognition", model=asr_model)

# Urdu keyword phrases that a transcript is matched against. The position of
# a phrase in this list is the numeric `reply` code reported for it.
# Approximate English glosses, in order: representative/agent, SIM activate,
# SIM block, mobile/internet packages, challan (bill) payment, play a song.
commands = [
    "نمائندے ایجنٹ نمائندہ",
    "سم ایکٹیویٹ",
    "سم بلاک بند",
    "موبائل پیکیجز انٹرنیٹ پیکیج",
    "چالان جمع",
    "گانا سنانا",
]
# Function to transcribe the command from audio
def transcribe_the_command(audio_list):
    """Transcribe recorded audio and match each transcript to a known command.

    Args:
        audio_list: Either a batch (list) of ``(audio_data, sample_rate)``
            pairs, or the single ``(sample_rate, audio_data)`` tuple that a
            Gradio ``Audio`` input of type "numpy" hands to its callback.
            ``None`` (nothing was recorded) is treated as an empty batch.

    Returns:
        A list holding one ``(transcript, most_similar_command, reply)``
        tuple per clip, where ``reply`` is the index of the matched command
        in the module-level ``commands`` list.
    """
    if audio_list is None:
        return []

    # A Gradio recording arrives as one (sample_rate, samples) tuple rather
    # than a batch; wrap it so the loop below sees (samples, sample_rate).
    is_single_recording = (
        isinstance(audio_list, tuple)
        and len(audio_list) == 2
        and isinstance(audio_list[0], int)
    )
    if is_single_recording:
        sample_rate, audio_data = audio_list
        audio_list = [(audio_data, sample_rate)]

    file_name = "recorded_audio.wav"
    transcriptions = []
    for audio_data, sample_rate in audio_list:
        # The pipeline is given a file path, so write the samples out first.
        sf.write(file_name, audio_data, sample_rate)
        # For a single input the pipeline returns one {"text": ...} dict,
        # not a list, so it must not be indexed with [0].
        transcript = asr_pipe(file_name)["text"]
        most_similar_command, reply = find_most_similar_command(transcript, commands)
        transcriptions.append((transcript, most_similar_command, reply))
    return transcriptions
# from transformers import pipeline
# asr_pipe = pipeline("automatic-speech-recognition", model="Abdullah17/whisper-small-urdu")
# from difflib import SequenceMatcher
# # List of commands
# commands = [
# "نمائندے ایجنٹ نمائندہ",
# " سم ایکٹیویٹ ",
# " سم بلاک بند ",
# "موبائل پیکیجز انٹرنیٹ پیکیج",
# " چالان جمع ",
# " گانا سنانا"
# ]
# # replies = [
# # 1,2,
# # ]
# # Function to find the most similar command
def find_most_similar_command(statement, command_list):
    """Return the command most similar to *statement* together with its index.

    Similarity is ``SequenceMatcher(None, statement, command).ratio()``. The
    first command with the strictly highest, non-zero ratio wins.

    Args:
        statement: Text to match against the commands.
        command_list: Candidate command strings.

    Returns:
        ``(best_match, reply)`` where ``reply`` is the position of
        ``best_match`` in ``command_list``. Both are ``None`` when the list
        is empty or no command has a similarity above zero.
    """
    best_match = None
    # Initialised up front so the "nothing matched" path returns cleanly
    # instead of hitting an unbound local.
    reply = None
    highest_similarity = 0
    for index, command in enumerate(command_list):
        similarity = SequenceMatcher(None, statement, command).ratio()
        # Strict comparison keeps the earliest command on ties.
        if similarity > highest_similarity:
            highest_similarity = similarity
            best_match = command
            reply = index
    return best_match, reply
# x
# get_text_from_voice("urdu.wav")
import gradio as gr
# Web demo: a microphone recording is passed to transcribe_the_command and
# whatever it returns is shown in a text output.
# NOTE(review): confirm the installed Gradio version accepts `duration=` on
# gr.inputs.Audio.
iface = gr.Interface(
    title="Whisper Small Urdu Command",
    description=(
        "Realtime demo for Urdu speech recognition using a fine-tuned "
        "Whisper small model and outputting the estimated command on "
        "the basis of speech transcript."
    ),
    fn=transcribe_the_command,
    inputs=gr.inputs.Audio(source="microphone", label="Recorded Audio", duration=6),
    outputs="text",
)

# Start serving the demo.
iface.launch()