import os
import base64
import gradio as gr
import openai
from pydub import AudioSegment
import io
import tempfile
import speech_recognition as sr
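
# Dependencies (a sketch of requirements, assuming current PyPI names):
# gradio, openai, pydub, SpeechRecognition. pydub also expects the
# ffmpeg binary on PATH to decode non-wav uploads.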
# Initialize the OpenAI client
client = openai.OpenAI(
    base_url="https://llama3-2-3b.lepton.run/api/v1/",
    api_key=os.environ.get('LEPTON_API_TOKEN')
)
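
# Sanity check (sketch): fail fast if the Lepton token is missing,
# rather than erroring on the first request.
#
# if not os.environ.get('LEPTON_API_TOKEN'):
#     raise RuntimeError("Set LEPTON_API_TOKEN before launching the demo.")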

def transcribe_audio(audio):
    # Convert the audio to wav format
    audio = AudioSegment.from_file(audio)
    audio = audio.set_frame_rate(16000).set_channels(1)

    # Save as wav file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        audio.export(temp_audio.name, format="wav")
        temp_audio_path = temp_audio.name

    # Perform speech recognition
    recognizer = sr.Recognizer()
    with sr.AudioFile(temp_audio_path) as source:
        audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)

    # Clean up the temporary file
    os.unlink(temp_audio_path)
    return text
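
# Defensive variant (sketch): recognize_google() raises
# sr.UnknownValueError on unintelligible speech and sr.RequestError when
# the Google Web Speech API is unreachable, so a hypothetical wrapper
# could surface those as readable messages instead of a stack trace.
#
# def transcribe_audio_safe(audio):
#     try:
#         return transcribe_audio(audio)
#     except (sr.UnknownValueError, sr.RequestError) as e:
#         return f"[transcription failed: {e}]"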

def process_audio(audio):
    # Transcribe the input audio
    transcription = transcribe_audio(audio)

    # Process the transcription with the API
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": transcription},
        ],
        max_tokens=128,
        stream=True,
        extra_body={
            "require_audio": "true",
            "tts_preset_id": "jessica",
        }
    )
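
    # require_audio and tts_preset_id are Lepton-specific fields; the
    # OpenAI SDK forwards extra_body keys to the request verbatim. The
    # model name is effectively a placeholder here (an assumption: a
    # dedicated Lepton deployment serves a single model regardless of
    # the name sent). Audio arrives as base64-encoded MP3 chunks in an
    # `audio` attribute on each streamed choice.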
    response_text = ""
    audios = []
    for chunk in completion:
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        audio = getattr(chunk.choices[0], 'audio', [])
        if content:
            response_text += content
        if audio:
            audios.extend(audio)

    # Combine audio chunks and save as MP3
    audio_data = b''.join([base64.b64decode(audio) for audio in audios])

    # Save the audio to a temporary file
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
        temp_audio.write(audio_data)
        temp_audio_path = temp_audio.name

    return response_text, temp_audio_path
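
# Quick local check (sketch, assuming LEPTON_API_TOKEN is set and a
# sample.wav file sits next to this script):
#
# text, audio_path = process_audio("sample.wav")
# print(text, audio_path)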

# Create the Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Response Text"),
        gr.Audio(label="Response Audio")
    ],
    title="Audio-to-Audio Demo",
    description="Upload an audio file to get a response in both text and audio format."
)
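
# gr.Audio renders a returned filepath directly, which is why
# process_audio hands back the temporary MP3 path as its second value.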
# Launch the interface
iface.launch()