"""Gradio demo: Persian speech-to-text with a fine-tuned Wav2Vec2 model."""

import gradio as gr
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the processor (feature extractor + tokenizer) and the CTC model once at
# startup; `from_pretrained` downloads/caches the weights on first run.
MODEL_NAME = "lnxdx/Wav2Vec2-Large-XLSR-Persian-ShEMO"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)

# Wav2Vec2 XLSR checkpoints are trained on 16 kHz audio.
TARGET_SAMPLE_RATE = 16000


def transcribe(audio):
    """Transcribe a recorded audio file to Persian text.

    Args:
        audio: Filesystem path to the recorded audio clip (Gradio passes a
            filepath because the component uses ``type="filepath"``). May be
            ``None`` if the user submits without recording.

    Returns:
        The decoded transcription string; empty string when no audio was given.
    """
    # Gradio sends None when the user hits submit with no recording.
    if audio is None:
        return ""

    # Resample to the rate the model was trained on; don't shadow the
    # filepath parameter with the waveform.
    waveform, _ = librosa.load(audio, sr=TARGET_SAMPLE_RATE)

    # Convert the raw waveform to normalized model input features.
    input_values = processor(
        waveform, return_tensors="pt", sampling_rate=TARGET_SAMPLE_RATE
    ).input_values

    # Inference only — no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse
    # repeats/blanks via the processor.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])


# NOTE(review): `source=` was renamed to `sources=["microphone"]` in Gradio 4.x.
# Keeping the original kwarg; update it if the installed Gradio is >= 4.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Persian Speech Recognition",
    description=(
        "Transcribe Persian speech to text using "
        "Wav2Vec2-Large-XLSR-Persian-ShEMO model."
    ),
)

if __name__ == "__main__":
    iface.launch()