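"""Audio-to-audio Gradio demo.

Pipeline (as implemented below): the uploaded audio is transcribed locally
with the SpeechRecognition library (Google Web Speech API), the transcript
is sent to an OpenAI-compatible Lepton endpoint that streams back text along
with base64-encoded TTS audio chunks, and both the text and the reassembled
MP3 are returned through the Gradio UI.
"""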
import base64
import os
import tempfile

import gradio as gr
import openai
import speech_recognition as sr
from pydub import AudioSegment
def transcribe_audio(audio):
    # Normalize the input to 16 kHz mono, the format expected by the recognizer
    audio = AudioSegment.from_file(audio)
    audio = audio.set_frame_rate(16000).set_channels(1)

    # Export to a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        audio.export(temp_audio.name, format="wav")
        temp_audio_path = temp_audio.name

    # Perform speech recognition via the Google Web Speech API
    recognizer = sr.Recognizer()
    with sr.AudioFile(temp_audio_path) as source:
        audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)

    # Clean up the temporary file
    os.unlink(temp_audio_path)
    return text
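
# Note on the streaming schema assumed by process_audio below: when
# `require_audio` is set in extra_body, the Lepton endpoint appears to extend
# the standard OpenAI streaming response so that each chunk may carry a list
# of base64-encoded MP3 segments on `choices[0].audio`. That field is not part
# of the official OpenAI schema, hence the defensive getattr() in the loop.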
def process_audio(audio, api_token):
    if not api_token:
        return "Please provide an API token.", None

    # Initialize an OpenAI-compatible client pointed at the Lepton-hosted
    # endpoint, authenticated with the user-provided token
    client = openai.OpenAI(
        base_url="https://llama3-2-3b.lepton.run/api/v1/",
        api_key=api_token
    )

    try:
        # Transcribe the input audio; recognition failures are reported
        # through the same error path as API failures
        transcription = transcribe_audio(audio)

        # Process the transcription with the API, streaming the response
        # and requesting TTS audio alongside the text
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": transcription},
            ],
            max_tokens=128,
            stream=True,
            extra_body={
                "require_audio": "true",
                "tts_preset_id": "jessica",
            }
        )

        response_text = ""
        audios = []
        for chunk in completion:
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            audio_chunks = getattr(chunk.choices[0], 'audio', [])
            if content:
                response_text += content
            if audio_chunks:
                audios.extend(audio_chunks)

        # Decode and concatenate the base64 audio chunks into one MP3
        audio_data = b''.join([base64.b64decode(segment) for segment in audios])

        # Save the audio to a temporary file for Gradio to serve
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
            temp_audio.write(audio_data)
            temp_audio_path = temp_audio.name

        return response_text, temp_audio_path
    except Exception as e:
        return f"An error occurred: {str(e)}", None
# Create the Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath", label="Input Audio"),
        gr.Textbox(label="API Token", type="password")
    ],
    outputs=[
        gr.Textbox(label="Response Text"),
        gr.Audio(label="Response Audio")
    ],
    title="Audio-to-Audio Demo",
    description="Upload an audio file and provide your API token to get a response in both text and audio format."
)

# Launch the interface
iface.launch()
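
# To run locally: pip install gradio openai pydub SpeechRecognition, make sure
# ffmpeg is available on PATH (pydub shells out to it for non-WAV inputs),
# then start the app with `python app.py`.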