import gradio as gr
import torch
import whisper
from transformers import pipeline

# Load the Whisper model for transcription (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_model = whisper.load_model("base", device=device)

# Load the emotion recognition pipeline (top_k=None returns scores for all labels)
emotion_recognition = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
)

# Function to transcribe audio
def transcribe_audio(audio_file):
    result = whisper_model.transcribe(audio_file)
    return result["text"]

# Function to transcribe audio and recognize emotions
def transcribe_and_recognize_emotions(audio_file):
    # Transcribe audio
    transcription = transcribe_audio(audio_file)

    # Recognize emotions in the transcribed text; truncate long transcriptions
    # to the model's maximum input length
    emotions = emotion_recognition(transcription, truncation=True)

    # Depending on the transformers version, the pipeline returns either a flat
    # list of {label, score} dicts or a list nested one level per input, so
    # normalize before picking the highest-scoring label
    scores = emotions[0] if isinstance(emotions[0], list) else emotions
    dominant_emotion = max(scores, key=lambda x: x["score"])["label"]

    return transcription, dominant_emotion

# Define the Gradio interface function
def gradio_transcription_emotion_interface(audio):
    transcription, emotion = transcribe_and_recognize_emotions(audio)
    return transcription, emotion

# Set up the Gradio interface
iface = gr.Interface(
    fn=gradio_transcription_emotion_interface,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Label(label="Dominant Emotion"),
    ],
    title="Audio Transcription and Emotion Recognition",
    description="Upload or record an audio file to get the transcription and recognize its dominant emotion.",
)

# Launch the interface
iface.launch(debug=True)
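
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original script). Assumes the script is saved
# as app.py; the package names below are the commonly published ones:
#
#   pip install openai-whisper transformers torch gradio
#   python app.py
#
# Gradio then serves the app locally (by default at http://127.0.0.1:7860).
# For a quick smoke test without the UI, the core function can also be called
# directly on an audio file path (file name here is hypothetical):
#
#   text, emotion = transcribe_and_recognize_emotions("sample.wav")
#   print(text, emotion)  # e.g. prints the transcript and a label such as "joy"
# ---------------------------------------------------------------------------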