akhaliq HF staff committed on
Commit
913a139
1 Parent(s): e490755

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -31
app.py CHANGED
@@ -14,11 +14,11 @@ from threading import Lock
14
  class AppState:
15
  stream: np.ndarray | None = None
16
  sampling_rate: int = 0
17
- pause_detected: bool = False
18
- started_talking: bool = False
19
- stopped: bool = False
20
  conversation: list = field(default_factory=list)
21
  client: openai.OpenAI = None
 
22
 
23
  # Global lock for thread safety
24
  state_lock = Lock()
@@ -33,21 +33,30 @@ def process_audio(audio: tuple, state: AppState):
33
  if state.stream is None:
34
  state.stream = audio[1]
35
  state.sampling_rate = audio[0]
 
36
  else:
37
  state.stream = np.concatenate((state.stream, audio[1]))
38
 
39
- # Simple pause detection (you might want to implement a more sophisticated method)
40
- if len(state.stream) > state.sampling_rate * 0.5: # 0.5 second of silence
41
- state.pause_detected = True
 
 
 
 
 
 
 
42
  return gr.Audio(recording=False), state
 
43
  return None, state
44
 
45
  def generate_response_and_audio(audio_bytes: bytes, state: AppState):
46
  if state.client is None:
47
  raise gr.Error("Please enter a valid API key first.")
48
 
49
- format_ = "opus"
50
- bitrate = 16
51
  audio_data = base64.b64encode(audio_bytes).decode()
52
 
53
  try:
@@ -60,8 +69,8 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
60
  },
61
  model="llama3.1-8b",
62
  messages=[{"role": "user", "content": [{"type": "audio", "data": audio_data}]}],
63
- temperature=0.5,
64
- max_tokens=128,
65
  stream=True,
66
  )
67
 
@@ -88,8 +97,8 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
88
  raise gr.Error(f"Error during audio streaming: {e}")
89
 
90
  def response(state: AppState):
91
- if not state.pause_detected:
92
- return None, None, AppState()
93
 
94
  audio_buffer = io.BytesIO()
95
  segment = AudioSegment(
@@ -113,6 +122,11 @@ def response(state: AppState):
113
  # Update the chatbot with the final conversation
114
  chatbot_output = state.conversation[-2:] # Get the last two messages (user input and AI response)
115
 
 
 
 
 
 
116
  return chatbot_output, final_audio, state
117
 
118
  def set_api_key(api_key, state):
@@ -121,9 +135,9 @@ def set_api_key(api_key, state):
121
  state.client = create_client(api_key)
122
  return "API key set successfully!", state
123
 
124
- def start_recording_user(state: AppState):
125
- if not state.stopped:
126
- return gr.Audio(recording=True)
127
 
128
  with gr.Blocks() as demo:
129
  with gr.Row():
@@ -132,6 +146,9 @@ with gr.Blocks() as demo:
132
 
133
  api_key_status = gr.Textbox(label="API Key Status", interactive=False)
134
 
 
 
 
135
  with gr.Row():
136
  with gr.Column():
137
  input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
@@ -142,13 +159,14 @@ with gr.Blocks() as demo:
142
  state = gr.State(AppState())
143
 
144
  set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
 
145
 
146
  stream = input_audio.stream(
147
  process_audio,
148
  [input_audio, state],
149
  [input_audio, state],
150
- stream_every=0.50,
151
- time_limit=30,
152
  )
153
 
154
  respond = input_audio.stop_recording(
@@ -157,18 +175,4 @@ with gr.Blocks() as demo:
157
  [chatbot, output_audio, state]
158
  )
159
 
160
- restart = output_audio.stop(
161
- start_recording_user,
162
- [state],
163
- [input_audio]
164
- )
165
-
166
- cancel = gr.Button("Stop Conversation", variant="stop")
167
- cancel.click(
168
- lambda: (AppState(stopped=True), gr.Audio(recording=False)),
169
- None,
170
- [state, input_audio],
171
- cancels=[respond, restart]
172
- )
173
-
174
  demo.launch()
 
14
  class AppState:
15
  stream: np.ndarray | None = None
16
  sampling_rate: int = 0
17
+ pause_start: float | None = None
18
+ last_speech: float = 0
 
19
  conversation: list = field(default_factory=list)
20
  client: openai.OpenAI = None
21
+ output_format: str = "mp3"
22
 
23
  # Global lock for thread safety
24
  state_lock = Lock()
 
33
  if state.stream is None:
34
  state.stream = audio[1]
35
  state.sampling_rate = audio[0]
36
+ state.last_speech = time.time()
37
  else:
38
  state.stream = np.concatenate((state.stream, audio[1]))
39
 
40
+ # Improved pause detection
41
+ current_time = time.time()
42
+ if np.max(np.abs(audio[1])) > 0.1: # Adjust this threshold as needed
43
+ state.last_speech = current_time
44
+ state.pause_start = None
45
+ elif state.pause_start is None:
46
+ state.pause_start = current_time
47
+
48
+ # Check if pause is long enough to stop recording
49
+ if state.pause_start and (current_time - state.pause_start > 2.0): # 2 seconds of silence
50
  return gr.Audio(recording=False), state
51
+
52
  return None, state
53
 
54
  def generate_response_and_audio(audio_bytes: bytes, state: AppState):
55
  if state.client is None:
56
  raise gr.Error("Please enter a valid API key first.")
57
 
58
+ format_ = state.output_format
59
+ bitrate = 128 if format_ == "mp3" else 32 # Higher bitrate for MP3, lower for OPUS
60
  audio_data = base64.b64encode(audio_bytes).decode()
61
 
62
  try:
 
69
  },
70
  model="llama3.1-8b",
71
  messages=[{"role": "user", "content": [{"type": "audio", "data": audio_data}]}],
72
+ temperature=0.7,
73
+ max_tokens=256,
74
  stream=True,
75
  )
76
 
 
97
  raise gr.Error(f"Error during audio streaming: {e}")
98
 
99
  def response(state: AppState):
100
+ if state.stream is None or len(state.stream) == 0:
101
+ return None, None, state
102
 
103
  audio_buffer = io.BytesIO()
104
  segment = AudioSegment(
 
122
  # Update the chatbot with the final conversation
123
  chatbot_output = state.conversation[-2:] # Get the last two messages (user input and AI response)
124
 
125
+ # Reset the audio stream for the next interaction
126
+ state.stream = None
127
+ state.pause_start = None
128
+ state.last_speech = 0
129
+
130
  return chatbot_output, final_audio, state
131
 
132
  def set_api_key(api_key, state):
 
135
  state.client = create_client(api_key)
136
  return "API key set successfully!", state
137
 
138
+ def update_format(format, state):
139
+ state.output_format = format
140
+ return state
141
 
142
  with gr.Blocks() as demo:
143
  with gr.Row():
 
146
 
147
  api_key_status = gr.Textbox(label="API Key Status", interactive=False)
148
 
149
+ with gr.Row():
150
+ format_dropdown = gr.Dropdown(choices=["mp3", "opus"], value="mp3", label="Output Audio Format")
151
+
152
  with gr.Row():
153
  with gr.Column():
154
  input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
 
159
  state = gr.State(AppState())
160
 
161
  set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
162
+ format_dropdown.change(update_format, inputs=[format_dropdown, state], outputs=[state])
163
 
164
  stream = input_audio.stream(
165
  process_audio,
166
  [input_audio, state],
167
  [input_audio, state],
168
+ stream_every=0.25, # Reduced to make it more responsive
169
+ time_limit=60, # Increased to allow for longer messages
170
  )
171
 
172
  respond = input_audio.stop_recording(
 
175
  [chatbot, output_audio, state]
176
  )
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  demo.launch()