akhaliq HF staff committed on
Commit
d58f539
1 Parent(s): 09596a0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import gradio as gr
4
+ import openai
5
+ from pydub import AudioSegment
6
+ import io
7
+ import tempfile
8
+ import speech_recognition as sr
9
+
10
# OpenAI-compatible client pointed at the Lepton-hosted endpoint below.
# The token is read from the LEPTON_API_TOKEN environment variable
# (os.environ.get yields None when it is not set).
client = openai.OpenAI(
    api_key=os.environ.get("LEPTON_API_TOKEN"),
    base_url="https://llama3-2-3b.lepton.run/api/v1/",
)
15
+
16
def transcribe_audio(audio):
    """Transcribe an audio file to text.

    The input is loaded with pydub, resampled to 16 kHz mono, written to a
    temporary WAV file, and passed to ``Recognizer.recognize_google``.

    Args:
        audio: Path (or file-like object) accepted by
            ``AudioSegment.from_file``.

    Returns:
        The text returned by ``recognize_google``.

    Raises:
        Any exception raised by pydub or speech_recognition propagates
        unchanged; the temporary WAV file is removed in every case.
    """
    segment = AudioSegment.from_file(audio)
    segment = segment.set_frame_rate(16000).set_channels(1)

    # Only reserve a unique path here and close the handle immediately, so
    # that the export below can reopen the file on platforms that forbid
    # opening a file twice (e.g. Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        # export() hands back the file object it wrote to; close it so the
        # handle is not leaked and the file can be deleted afterwards.
        exported = segment.export(temp_audio_path, format="wav")
        exported.close()

        recognizer = sr.Recognizer()
        with sr.AudioFile(temp_audio_path) as source:
            audio_data = recognizer.record(source)
        return recognizer.recognize_google(audio_data)
    finally:
        # Always remove the temporary file, even when export or recognition
        # fails.
        os.unlink(temp_audio_path)
36
+
37
def process_audio(audio):
    """Transcribe the input audio, send it to the chat API, and collect the reply.

    The transcription is sent as a single user message with streaming
    enabled and audio requested through ``extra_body``. Text deltas are
    concatenated; base64-encoded audio pieces attached to the streamed
    choices are decoded and joined into one file.

    Args:
        audio: Audio input forwarded unchanged to ``transcribe_audio``.

    Returns:
        A ``(response_text, audio_path)`` tuple. ``audio_path`` is the path
        of a temporary ``.mp3`` file holding the decoded audio, or ``None``
        when the stream contained no audio data (instead of pointing at an
        empty, unplayable file).
    """
    transcription = transcribe_audio(audio)

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": transcription},
        ],
        max_tokens=128,
        stream=True,
        extra_body={
            "require_audio": "true",
            "tts_preset_id": "jessica",
        },
    )

    text_parts = []
    encoded_audio = []

    for chunk in completion:
        if not chunk.choices:
            continue
        choice = chunk.choices[0]

        # Guard against chunks that carry no delta (or no content) at all.
        content = getattr(getattr(choice, "delta", None), "content", None)
        if content:
            text_parts.append(content)

        # 'audio' is a non-standard field on the choice; absent on chunks
        # that carry text only. Uses a distinct name so the 'audio'
        # parameter is not shadowed.
        chunk_audio = getattr(choice, "audio", None)
        if chunk_audio:
            encoded_audio.extend(chunk_audio)

    response_text = "".join(text_parts)
    audio_bytes = b"".join(base64.b64decode(piece) for piece in encoded_audio)

    if not audio_bytes:
        # Nothing to play back; avoid creating a zero-byte file.
        return response_text, None

    # delete=False: the file must outlive this function so the caller can
    # read it. The ".mp3" suffix follows the original code's assumption
    # about the returned audio format.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
        temp_audio.write(audio_bytes)
        temp_audio_path = temp_audio.name

    return response_text, temp_audio_path
77
+
78
# Gradio UI: a single audio input (handed to process_audio as a file path)
# and two outputs, the text reply and the audio reply.
iface = gr.Interface(
    title="Audio-to-Audio Demo",
    description="Upload an audio file to get a response in both text and audio format.",
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Response Text"),
        gr.Audio(label="Response Audio"),
    ],
)

# Start serving the interface.
iface.launch()