File size: 2,531 Bytes
d58f539
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import base64
import gradio as gr
import openai
from pydub import AudioSegment
import io
import tempfile
import speech_recognition as sr

# Shared chat-completions client: the stock OpenAI SDK pointed at the Lepton
# deployment URL. The token comes from the LEPTON_API_TOKEN environment
# variable; when that variable is unset, None is passed as the api_key.
client = openai.OpenAI(
    api_key=os.getenv("LEPTON_API_TOKEN"),
    base_url="https://llama3-2-3b.lepton.run/api/v1/",
)

def transcribe_audio(audio):
    """Transcribe an audio file to text.

    The input is decoded with pydub, resampled to 16 kHz mono, written to a
    temporary WAV file, and that file is passed through
    ``speech_recognition``'s ``recognize_google``.

    Args:
        audio: Anything ``AudioSegment.from_file`` accepts (here: the file
            path handed over by Gradio).

    Returns:
        The text returned by ``Recognizer.recognize_google``.

    Raises:
        Any exception raised by pydub while decoding/exporting or by
        ``speech_recognition`` while reading/recognizing is propagated
        unchanged (per the SpeechRecognition docs this includes
        ``sr.UnknownValueError`` and ``sr.RequestError``). The temporary
        file is removed in every case.
    """
    # Normalise to 16 kHz mono before handing the clip to the recognizer.
    segment = AudioSegment.from_file(audio)
    segment = segment.set_frame_rate(16000).set_channels(1)

    # Only reserve a unique path here; the handle is closed before pydub
    # writes to it so the export never competes with an open handle.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        # export() hands back an open file object; close it right away so the
        # descriptor is not leaked and the file can be unlinked afterwards.
        segment.export(temp_audio_path, format="wav").close()

        recognizer = sr.Recognizer()
        with sr.AudioFile(temp_audio_path) as source:
            audio_data = recognizer.record(source)
        return recognizer.recognize_google(audio_data)
    finally:
        # Clean up even when decoding or recognition fails.
        os.unlink(temp_audio_path)

def process_audio(audio):
    """Answer a spoken prompt with generated text and synthesized speech.

    The clip is transcribed with ``transcribe_audio``, the transcript is sent
    as a single user message through ``client.chat.completions.create`` with
    streaming enabled, and the streamed chunks are folded into the reply text
    and the base64-decoded reply audio.

    Args:
        audio: Path of the input audio file, or ``None`` when the user
            submitted without providing a clip.

    Returns:
        A ``(response_text, audio_path)`` tuple. ``audio_path`` is the path
        of a temporary ``.mp3`` file containing the decoded audio bytes as
        received, or ``None`` when the stream carried no audio.

    Raises:
        gr.Error: If ``audio`` is ``None``.
    """
    if audio is None:
        # Surface a readable message in the UI instead of an opaque crash
        # from the audio decoder.
        raise gr.Error("Please upload or record an audio clip first.")

    # Transcribe the input audio
    transcription = transcribe_audio(audio)

    # Process the transcription with the API
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": transcription},
        ],
        max_tokens=128,
        stream=True,
        extra_body={
            "require_audio": "true",
            "tts_preset_id": "jessica",
        }
    )

    text_parts = []
    encoded_audio = []

    for chunk in completion:
        if not chunk.choices:
            continue
        choice = chunk.choices[0]
        content = choice.delta.content
        if content:
            text_parts.append(content)
        # 'audio' is not a standard attribute on the choice object, so it is
        # read defensively; when present it is a list of base64 strings.
        audio_parts = getattr(choice, 'audio', None)
        if audio_parts:
            encoded_audio.extend(audio_parts)

    response_text = "".join(text_parts)

    # Combine audio chunks
    audio_bytes = b"".join(base64.b64decode(part) for part in encoded_audio)
    if not audio_bytes:
        # Nothing to play back: skip writing an empty (invalid) audio file.
        return response_text, None

    # Save the audio to a temporary file. It is intentionally not deleted
    # here because the caller needs the path to remain readable.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
        temp_audio.write(audio_bytes)
        temp_audio_path = temp_audio.name

    return response_text, temp_audio_path

# Build the UI around process_audio: one audio input delivered to the
# function as a file path, and two outputs (reply text, reply audio) filled
# from the function's return tuple in that order.
iface = gr.Interface(
    title="Audio-to-Audio Demo",
    description="Upload an audio file to get a response in both text and audio format.",
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
)

# Start serving the app
iface.launch()