llama-3.2-3b-voice

Runtime error

App Files Files Community

akhaliq HF staff commited on Sep 27

Commit

f8bd65e

•

1 Parent(s): c75877d

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -76

app.py CHANGED Viewed

@@ -12,6 +12,11 @@ from threading import Lock
 @dataclass
 class AppState:
     conversation: list = field(default_factory=list)
     client: openai.OpenAI = None
@@ -24,17 +29,27 @@ def create_client(api_key):
         api_key=api_key
     )
-def process_audio_file(audio_file, state):
     if state.client is None:
         raise gr.Error("Please enter a valid API key first.")
     format_ = "opus"
     bitrate = 16
-    with open(audio_file.name, "rb") as f:
-        audio_bytes = f.read()
     audio_data = base64.b64encode(audio_bytes).decode()
     try:
         stream = state.client.chat.completions.create(
             extra_body={
@@ -50,69 +65,42 @@ def process_audio_file(audio_file, state):
             stream=True,
         )
-        transcript = ""
-        audio_chunks = []
         for chunk in stream:
-            if chunk.choices[0].delta.content:
-                transcript += chunk.choices[0].delta.content
-            if hasattr(chunk.choices[0], 'audio') and chunk.choices[0].audio:
-                audio_chunks.extend(chunk.choices[0].audio)
-        audio_data = b''.join([base64.b64decode(a) for a in audio_chunks])
-        return transcript, audio_data, state
     except Exception as e:
-        raise gr.Error(f"Error processing audio: {str(e)}")
-def generate_response_and_audio(message, state):
-    if state.client is None:
-        raise gr.Error("Please enter a valid API key first.")
-    with state_lock:
-        state.conversation.append({"role": "user", "content": message})
-        try:
-            completion = state.client.chat.completions.create(
-                model="llama3-1-8b",
-                messages=state.conversation,
-                max_tokens=128,
-                stream=True,
-                extra_body={
-                    "require_audio": "true",
-                    "tts_preset_id": "jessica",
-                }
-            )
-            full_response = ""
-            audio_chunks = []
-            for chunk in completion:
-                if not chunk.choices:
-                    continue
-                content = chunk.choices[0].delta.content
-                audio = getattr(chunk.choices[0], 'audio', [])
-                if content:
-                    full_response += content
-                    yield full_response, None, state
-                if audio:
-                    audio_chunks.extend(audio)
-                    audio_data = b''.join([base64.b64decode(a) for a in audio_chunks])
-                    yield full_response, audio_data, state
-            state.conversation.append({"role": "assistant", "content": full_response})
-        except Exception as e:
-            raise gr.Error(f"Error generating response: {str(e)}")
-def chat(message, state):
-    if not message:
-        return "", None, state
-    return generate_response_and_audio(message, state)
 def set_api_key(api_key, state):
     if not api_key:
@@ -120,9 +108,11 @@ def set_api_key(api_key, state):
     state.client = create_client(api_key)
     return "API key set successfully!", state
 with gr.Blocks() as demo:
-    state = gr.State(AppState())
     with gr.Row():
         api_key_input = gr.Textbox(type="password", label="Enter your Lepton API Key")
         set_key_button = gr.Button("Set API Key")
@@ -130,20 +120,42 @@ with gr.Blocks() as demo:
     api_key_status = gr.Textbox(label="API Key Status", interactive=False)
     with gr.Row():
-        with gr.Column(scale=1):
-            audio_file_input = gr.Audio(sources="microphone")
-        with gr.Column(scale=2):
-            chatbot = gr.Chatbot()
-            text_input = gr.Textbox(show_label=False, placeholder="Type your message here...")
-        with gr.Column(scale=1):
-            audio_output = gr.Audio(label="Generated Audio")
     set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
-    audio_file_input.change(
-        process_audio_file,
-        inputs=[audio_file_input, state],
-        outputs=[text_input, audio_output, state]
     )
-    text_input.submit(chat, inputs=[text_input, state], outputs=[chatbot, audio_output, state])
 demo.launch()

 @dataclass
 class AppState:
+    stream: np.ndarray | None = None
+    sampling_rate: int = 0
+    pause_detected: bool = False
+    started_talking: bool = False
+    stopped: bool = False
     conversation: list = field(default_factory=list)
     client: openai.OpenAI = None
         api_key=api_key
     )
+def process_audio(audio: tuple, state: AppState):
+    if state.stream is None:
+        state.stream = audio[1]
+        state.sampling_rate = audio[0]
+    else:
+        state.stream = np.concatenate((state.stream, audio[1]))
+    # Simple pause detection (you might want to implement a more sophisticated method)
+    if len(state.stream) > state.sampling_rate * 0.5:  # 0.5 second of silence
+        state.pause_detected = True
+        return gr.Audio(recording=False), state
+    return None, state
+def generate_response_and_audio(audio_bytes: bytes, state: AppState):
     if state.client is None:
         raise gr.Error("Please enter a valid API key first.")
     format_ = "opus"
     bitrate = 16
     audio_data = base64.b64encode(audio_bytes).decode()
     try:
         stream = state.client.chat.completions.create(
             extra_body={
             stream=True,
         )
+        full_response = ""
+        audios = []
         for chunk in stream:
+            if not chunk.choices:
+                continue
+            content = chunk.choices[0].delta.content
+            audio = getattr(chunk.choices[0], 'audio', [])
+            if content:
+                full_response += content
+                yield full_response, None, state
+            if audio:
+                audios.extend(audio)
+                audio_data = b''.join([base64.b64decode(a) for a in audios])
+                yield full_response, audio_data, state
+        state.conversation.append({"role": "user", "content": "Audio input"})
+        state.conversation.append({"role": "assistant", "content": full_response})
     except Exception as e:
+        raise gr.Error(f"Error during audio streaming: {e}")
+def response(state: AppState):
+    if not state.pause_detected:
+        return None, None, AppState()
+    audio_buffer = io.BytesIO()
+    segment = AudioSegment(
+        state.stream.tobytes(),
+        frame_rate=state.sampling_rate,
+        sample_width=state.stream.dtype.itemsize,
+        channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
+    )
+    segment.export(audio_buffer, format="wav")
+    return generate_response_and_audio(audio_buffer.getvalue(), state)
 def set_api_key(api_key, state):
     if not api_key:
     state.client = create_client(api_key)
     return "API key set successfully!", state
+def start_recording_user(state: AppState):
+    if not state.stopped:
+        return gr.Audio(recording=True)
 with gr.Blocks() as demo:
     with gr.Row():
         api_key_input = gr.Textbox(type="password", label="Enter your Lepton API Key")
         set_key_button = gr.Button("Set API Key")
     api_key_status = gr.Textbox(label="API Key Status", interactive=False)
     with gr.Row():
+        with gr.Column():
+            input_audio = gr.Audio(label="Input Audio", sources="microphone", type="numpy")
+        with gr.Column():
+            chatbot = gr.Chatbot(label="Conversation", type="messages")
+            output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
+    state = gr.State(AppState())
     set_key_button.click(set_api_key, inputs=[api_key_input, state], outputs=[api_key_status, state])
+    stream = input_audio.stream(
+        process_audio,
+        [input_audio, state],
+        [input_audio, state],
+        stream_every=0.50,
+        time_limit=30,
     )
+    respond = input_audio.stop_recording(
+        response,
+        [state],
+        [chatbot, output_audio, state]
+    )
+    restart = output_audio.stop(
+        start_recording_user,
+        [state],
+        [input_audio]
+    )
+    cancel = gr.Button("Stop Conversation", variant="stop")
+    cancel.click(
+        lambda: (AppState(stopped=True), gr.Audio(recording=False)),
+        None,
+        [state, input_audio],
+        cancels=[respond, restart]
+    )
 demo.launch()