akhaliq (HF staff) committed
Commit f7de418
1 Parent(s): 669ae67

Update app.py

Files changed (1): app.py (+75, -77)
app.py CHANGED
@@ -1,59 +1,37 @@
-import base64
 import gradio as gr
-import openai
-from pydub import AudioSegment
+import numpy as np
 import io
+from pydub import AudioSegment
 import tempfile
-import speech_recognition as sr
 import os
+import base64
+import openai
+from dataclasses import dataclass, field
+from threading import Lock
 
-def transcribe_audio(audio):
-    try:
-        # Convert the audio to wav format
-        audio = AudioSegment.from_file(audio)
-        audio = audio.set_frame_rate(16000).set_channels(1)
-
-        # Save as wav file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
-            audio.export(temp_audio.name, format="wav")
-            temp_audio_path = temp_audio.name
-
-        # Perform speech recognition
-        recognizer = sr.Recognizer()
-        with sr.AudioFile(temp_audio_path) as source:
-            audio_data = recognizer.record(source)
-            text = recognizer.recognize_google(audio_data)
-
-        return text
-    except Exception as e:
-        return f"Error in transcription: {str(e)}"
-    finally:
-        # Clean up the temporary file
-        if 'temp_audio_path' in locals():
-            os.unlink(temp_audio_path)
-
-def process_audio(audio, api_token):
-    if not api_token:
-        return "Please provide an API token.", None
+# Lepton API setup
+client = openai.OpenAI(
+    base_url="https://llama3-1-8b.lepton.run/api/v1/",
+    api_key=os.environ.get('LEPTON_API_TOKEN')
+)
 
-    # Initialize the OpenAI client with the user-provided token
-    client = openai.OpenAI(
-        base_url="https://llama3-2-3b.lepton.run/api/v1/",
-        api_key=api_token
-    )
+@dataclass
+class AppState:
+    conversation: list = field(default_factory=list)
+    lock: Lock = field(default_factory=Lock)
 
-    # Transcribe the input audio
-    transcription = transcribe_audio(audio)
-    if transcription.startswith("Error in transcription:"):
-        return transcription, None
+def transcribe_audio(audio):
+    # This is a placeholder function. In a real-world scenario, you'd use a
+    # speech-to-text service here. For now, we'll just return a dummy transcript.
+    return "This is a dummy transcript. Please implement actual speech-to-text functionality."
 
-    try:
-        # Process the transcription with the API
+def generate_response_and_audio(message, state):
+    with state.lock:
+        state.conversation.append({"role": "user", "content": message})
+
         completion = client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[
-                {"role": "user", "content": transcription},
-            ],
+            model="llama3-1-8b",
+            messages=state.conversation,
             max_tokens=128,
             stream=True,
             extra_body={
@@ -62,46 +40,66 @@ def process_audio(audio, api_token):
            }
        )
 
-        response_text = ""
-        audios = []
+        full_response = ""
+        audio_chunks = []
 
         for chunk in completion:
             if not chunk.choices:
                 continue
+
             content = chunk.choices[0].delta.content
             audio = getattr(chunk.choices[0], 'audio', [])
+
             if content:
-                response_text += content
+                full_response += content
+                yield full_response, None, state
+
             if audio:
-                audios.extend(audio)
+                audio_chunks.extend(audio)
+                audio_data = b''.join([base64.b64decode(a) for a in audio_chunks])
+                yield full_response, audio_data, state
 
-        # Combine audio chunks and save as MP3
-        audio_data = b''.join([base64.b64decode(audio) for audio in audios])
-
-        # Save the audio to a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
-            temp_audio.write(audio_data)
-            temp_audio_path = temp_audio.name
+        state.conversation.append({"role": "assistant", "content": full_response})
 
-        return response_text, temp_audio_path
+def chat(message, state):
+    if not message:
+        return "", None, state
 
-    except Exception as e:
-        return f"An error occurred during API processing: {str(e)}", None
+    return generate_response_and_audio(message, state)
 
-# Create the Gradio interface
-iface = gr.Interface(
-    fn=process_audio,
-    inputs=[
-        gr.Audio(type="filepath", label="Input Audio"),
-        gr.Textbox(label="API Token", type="password")
-    ],
-    outputs=[
-        gr.Textbox(label="Response Text"),
-        gr.Audio(label="Response Audio")
-    ],
-    title="Audio-to-Audio Demo",
-    description="Upload an audio file and provide your API token to get a response in both text and audio format."
-)
+def process_audio(audio, state):
+    if audio is None:
+        return "", state
+
+    # Convert numpy array to wav
+    audio_segment = AudioSegment(
+        audio[1].tobytes(),
+        frame_rate=audio[0],
+        sample_width=audio[1].dtype.itemsize,
+        channels=1 if len(audio[1].shape) == 1 else audio[1].shape[1]
+    )
+
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
+        audio_segment.export(temp_audio.name, format="wav")
+        transcript = transcribe_audio(temp_audio.name)
+
+    os.unlink(temp_audio.name)
+
+    return transcript, state
 
-# Launch the interface
-iface.launch()
+with gr.Blocks() as demo:
+    state = gr.State(AppState())
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(source="microphone", type="numpy")
+        with gr.Column(scale=2):
+            chatbot = gr.Chatbot()
+            text_input = gr.Textbox(show_label=False, placeholder="Type your message here...")
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(label="Generated Audio")
+
+    audio_input.change(process_audio, [audio_input, state], [text_input, state])
+    text_input.submit(chat, [text_input, state], [chatbot, audio_output, state])
+
+    demo.launch()
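
Note that the new transcribe_audio is only a stub, so the microphone path currently returns a dummy transcript. A minimal sketch of a working replacement, reusing the SpeechRecognition-based approach that this commit removes (it assumes the SpeechRecognition package is installed and that the function receives the WAV path written by process_audio):

import speech_recognition as sr

def transcribe_audio(audio_path):
    # Read the WAV file written by process_audio and run it through the free
    # Google web recognizer, as the previous version of app.py did.
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return ""
    except sr.RequestError as e:
        return f"Error in transcription: {e}"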
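
generate_response_and_audio streams: each yield carries the text generated so far, plus (once chunk.choices[0].audio starts arriving) the base64-decoded audio accumulated to that point. A hypothetical way to exercise it outside the Gradio UI, assuming LEPTON_API_TOKEN is set and the Lepton endpoint is reachable; the output filename and container format are illustrative, since they depend on what the endpoint actually returns:

if __name__ == "__main__":
    state = AppState()
    last_audio = None
    for text_so_far, audio_bytes, _ in generate_response_and_audio("Hello there!", state):
        print(text_so_far)            # partial response grows as tokens stream in
        if audio_bytes is not None:
            last_audio = audio_bytes  # all audio decoded so far
    if last_audio:
        # .mp3 matches what the previous app.py assumed; adjust to the real format.
        with open("response.mp3", "wb") as out:
            out.write(last_audio)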