import gradio as gr
import torch
import whisper
from transformers import pipeline

# Load the Whisper model for transcription (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model = whisper.load_model("base", device=device)

# Load the emotion recognition pipeline (top_k=None returns scores for all labels)
emotion_recognition = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
)

# Function to transcribe audio
def transcribe_audio(audio_file):
    result = whisper_model.transcribe(audio_file)
    return result["text"]

# Function to transcribe audio and recognize emotions
def transcribe_and_recognize_emotions(audio_file):
    # Transcribe audio
    transcription = transcribe_audio(audio_file)

    # Recognize emotions in the transcribed text; truncate long transcriptions
    # to the model's maximum input length
    emotions = emotion_recognition(transcription, truncation=True)

    # Depending on the transformers version, the pipeline returns either a flat
    # list of {label, score} dicts or a list nested one level per input, so
    # normalize before picking the highest-scoring label
    scores = emotions[0] if isinstance(emotions[0], list) else emotions
    dominant_emotion = max(scores, key=lambda x: x["score"])["label"]

    return transcription, dominant_emotion

# Define the Gradio interface function
def gradio_transcription_emotion_interface(audio):
    transcription, emotion = transcribe_and_recognize_emotions(audio)
    return transcription, emotion

# Set up the Gradio interface
iface = gr.Interface(
    fn=gradio_transcription_emotion_interface,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Label(label="Dominant Emotion"),
    ],
    title="Audio Transcription and Emotion Recognition",
    description="Upload or record an audio file to get the transcription and recognize its dominant emotion.",
)

# Launch the interface
iface.launch(debug=True)
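
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original script). Assumes the script is saved
# as app.py; the package names below are the commonly published ones:
#
#   pip install openai-whisper transformers torch gradio
#   python app.py
#
# Gradio then serves the app locally (by default at http://127.0.0.1:7860).
# For a quick smoke test without the UI, the core function can also be called
# directly on an audio file path (file name here is hypothetical):
#
#   text, emotion = transcribe_and_recognize_emotions("sample.wav")
#   print(text, emotion)  # e.g. prints the transcript and a label such as "joy"
# ---------------------------------------------------------------------------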