"""Gradio demo: Persian speech-to-text with a fine-tuned Wav2Vec2 model."""

import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the processor (feature extractor + tokenizer) and the CTC model once at
# startup; `from_pretrained` downloads/caches the weights on first run.
MODEL_NAME = "lnxdx/Wav2Vec2-Large-XLSR-Persian-ShEMO"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Wav2Vec2 XLSR checkpoints are trained on 16 kHz audio.
TARGET_SAMPLE_RATE = 16000


def transcribe(audio):
    """Transcribe a recorded audio file to Persian text.

    Args:
        audio: Filesystem path to the recorded audio clip (Gradio passes a
            filepath because the component uses ``type="filepath"``). May be
            ``None`` if the user submits without recording.

    Returns:
        The decoded transcription string; empty string when no audio was given.
    """
    # Gradio sends None when the user hits submit with no recording.
    if audio is None:
        return ""

    # Resample to the rate the model was trained on; don't shadow the
    # filepath parameter with the waveform.
    waveform, _ = librosa.load(audio, sr=TARGET_SAMPLE_RATE)

    # Convert the raw waveform to normalized model input features.
    input_values = processor(
        waveform, return_tensors="pt", sampling_rate=TARGET_SAMPLE_RATE
    ).input_values

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse
    # repeats/blanks via the processor.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])


# NOTE(review): `source=` was renamed to `sources=["microphone"]` in Gradio 4.x.
# Keeping the original kwarg; update it if the installed Gradio is >= 4.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Persian Speech Recognition",
    description=(
        "Transcribe Persian speech to text using "
        "Wav2Vec2-Large-XLSR-Persian-ShEMO model."
    ),
)

if __name__ == "__main__":
    iface.launch()