# Source: Hugging Face file view of app.py
# Author: akhaliq (HF staff)
# Commit: 669ae67 (verified) - "Update app.py"
# Size: 3.3 kB
import base64
import gradio as gr
import openai
from pydub import AudioSegment
import io
import tempfile
import speech_recognition as sr
import os
def transcribe_audio(audio):
    """Transcribe an audio input to text via ``recognize_google``.

    The input is loaded with ``AudioSegment.from_file``, converted to
    16 kHz mono, exported to a temporary WAV file, read back through
    ``speech_recognition`` and passed to ``Recognizer.recognize_google``.
    The temporary file is removed before returning.

    Args:
        audio: Whatever ``AudioSegment.from_file`` accepts (the caller
            supplies this value unchanged).

    Returns:
        The recognized text on success. On any failure, a string of the
        form ``"Error in transcription: <message>"``; no exception is
        propagated.
    """
    # Known up front so the ``finally`` clause can clean up even when a
    # later step (e.g. the export) fails.
    temp_audio_path = None
    try:
        # Convert the audio to 16 kHz mono before writing it out as WAV.
        segment = AudioSegment.from_file(audio)
        segment = segment.set_frame_rate(16000).set_channels(1)

        # Reserve a unique path, then close our handle before exporting:
        # a NamedTemporaryFile that is still open cannot be reopened by
        # name on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            temp_audio_path = temp_audio.name

        # pydub documents ``export`` as returning the output file handle;
        # close it so the file is not held open when it is read/removed.
        exported = segment.export(temp_audio_path, format="wav")
        exported.close()

        # Perform speech recognition on the exported WAV file.
        recognizer = sr.Recognizer()
        with sr.AudioFile(temp_audio_path) as source:
            audio_data = recognizer.record(source)
        return recognizer.recognize_google(audio_data)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error in transcription: {str(e)}"
    finally:
        # Best-effort cleanup; a failed delete must not replace the
        # function's return value with an exception.
        if temp_audio_path is not None:
            try:
                os.unlink(temp_audio_path)
            except OSError:
                pass
def process_audio(audio, api_token):
    """Transcribe ``audio``, send the text to the chat API, return text and audio.

    Args:
        audio: Audio input forwarded unchanged to ``transcribe_audio``.
        api_token: API key used to construct the ``openai.OpenAI`` client.

    Returns:
        A ``(text, audio_path)`` tuple:

        * ``("Please provide an API token.", None)`` when ``api_token`` is
          empty or ``None``.
        * ``(transcription_error, None)`` when transcription reports an
          error (a string starting with ``"Error in transcription:"``).
        * ``(response_text, path)`` on success, where ``path`` is a
          temporary ``.mp3`` file holding the decoded audio, or ``None``
          when the response carried no audio data.
        * ``("An error occurred during API processing: <message>", None)``
          when the API call or stream handling raises.
    """
    if not api_token:
        return "Please provide an API token.", None

    # Initialize the OpenAI client with the user-provided token.
    client = openai.OpenAI(
        base_url="https://llama3-2-3b.lepton.run/api/v1/",
        api_key=api_token,
    )

    # Transcribe the input audio; failures come back as a prefixed string.
    transcription = transcribe_audio(audio)
    if transcription.startswith("Error in transcription:"):
        return transcription, None

    try:
        # Process the transcription with the API, streaming the reply.
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": transcription},
            ],
            max_tokens=128,
            stream=True,
            extra_body={
                "require_audio": "true",
                "tts_preset_id": "jessica",
            },
        )

        text_parts = []
        encoded_audio = []
        for chunk in completion:
            if not chunk.choices:
                continue
            choice = chunk.choices[0]
            content = choice.delta.content
            # ``audio`` is not a standard attribute of a choice, hence the
            # getattr default. Presumably a list of base64 strings - it is
            # extended into a list and each item is base64-decoded below.
            chunk_audio = getattr(choice, "audio", [])
            if content:
                text_parts.append(content)
            if chunk_audio:
                encoded_audio.extend(chunk_audio)

        response_text = "".join(text_parts)

        # Decode every audio piece and concatenate the raw bytes.
        audio_data = b"".join(base64.b64decode(piece) for piece in encoded_audio)
        if not audio_data:
            # No audio in the response: do not create (and leak) an empty
            # .mp3 file or hand the audio output an unplayable path.
            return response_text, None

        # The file is intentionally kept (delete=False): the returned path
        # is consumed by the caller after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
            temp_audio.write(audio_data)
            temp_audio_path = temp_audio.name

        return response_text, temp_audio_path
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"An error occurred during API processing: {str(e)}", None
# Build the Gradio UI: an audio file path and a password-style token field
# go in; the response text and response audio come out.
input_components = [
    gr.Audio(type="filepath", label="Input Audio"),
    gr.Textbox(label="API Token", type="password"),
]
output_components = [
    gr.Textbox(label="Response Text"),
    gr.Audio(label="Response Audio"),
]

iface = gr.Interface(
    fn=process_audio,
    inputs=input_components,
    outputs=output_components,
    title="Audio-to-Audio Demo",
    description="Upload an audio file and provide your API token to get a response in both text and audio format.",
)

# Start serving the interface.
iface.launch()