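"""Gradio audio-to-audio demo (Hugging Face Space).

Pipeline: the uploaded audio is transcribed with the free Google Web Speech
recognizer (via the SpeechRecognition package), the transcript is sent to a
Lepton-hosted chat endpoint through the OpenAI-compatible client, and the
streamed reply is returned as both text and a synthesized MP3.
"""
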
import base64
import gradio as gr
import openai
from pydub import AudioSegment
import tempfile
import speech_recognition as sr
import os
def transcribe_audio(audio):
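    """Transcribe an audio file to text.

    The input is normalized to 16 kHz mono WAV before being passed to the
    Google Web Speech recognizer. Returns the transcript on success, or a
    string prefixed with "Error in transcription:" on failure.
    """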
    temp_audio_path = None
    try:
        # Normalize to 16 kHz mono, then convert to WAV
        audio = AudioSegment.from_file(audio)
        audio = audio.set_frame_rate(16000).set_channels(1)
        # Save as a temporary WAV file for SpeechRecognition
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
            audio.export(temp_audio.name, format="wav")
            temp_audio_path = temp_audio.name
        # Perform speech recognition
        recognizer = sr.Recognizer()
        with sr.AudioFile(temp_audio_path) as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
        return text
    except Exception as e:
        return f"Error in transcription: {str(e)}"
    finally:
        # Clean up the temporary file
        if temp_audio_path is not None:
            os.unlink(temp_audio_path)

def process_audio(audio, api_token):
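    """Transcribe the input audio, send the transcript to the chat endpoint,
    and return the reply as (response_text, path_to_mp3_or_None).
    """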
    if not api_token:
        return "Please provide an API token.", None

    # Initialize the OpenAI client against Lepton's OpenAI-compatible
    # endpoint with the user-provided token
    client = openai.OpenAI(
        base_url="https://llama3-2-3b.lepton.run/api/v1/",
        api_key=api_token
    )

    # Transcribe the input audio
    transcription = transcribe_audio(audio)
    if transcription.startswith("Error in transcription:"):
        return transcription, None
    try:
        # Request a streamed reply; require_audio and tts_preset_id are
        # endpoint-specific parameters passed through extra_body
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": transcription},
            ],
            max_tokens=128,
            stream=True,
            extra_body={
                "require_audio": "true",
                "tts_preset_id": "jessica",
            }
        )
response_text = ""
audios = []
for chunk in completion:
if not chunk.choices:
continue
content = chunk.choices[0].delta.content
audio = getattr(chunk.choices[0], 'audio', [])
if content:
response_text += content
if audio:
audios.extend(audio)
# Combine audio chunks and save as MP3
audio_data = b''.join([base64.b64decode(audio) for audio in audios])
# Save the audio to a temporary file
with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
temp_audio.write(audio_data)
temp_audio_path = temp_audio.name
return response_text, temp_audio_path
except Exception as e:
return f"An error occurred during API processing: {str(e)}", None
# Create the Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=[
        gr.Audio(type="filepath", label="Input Audio"),
        gr.Textbox(label="API Token", type="password")
    ],
    outputs=[
        gr.Textbox(label="Response Text"),
        gr.Audio(label="Response Audio")
    ],
    title="Audio-to-Audio Demo",
    description="Upload an audio file and provide your API token to get a response in both text and audio format."
)
# Launch the interface
iface.launch()
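
# Local usage sketch (assumes this file is saved as app.py; the token and the
# input file below are placeholders, not real values):
#   $ python app.py    # serves the UI on http://127.0.0.1:7860 by default
# or call the handler directly, bypassing the UI:
#   text, mp3_path = process_audio("question.wav", "YOUR_LEPTON_API_TOKEN")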