llama-3.2-3b-voice

Runtime error

App Files Files Community

yadongxie committed on Sep 27

Commit

120d632

•

1 Parent(s): 1926927

fix: add asr result back

Browse files

Files changed (1) hide show

app.py +14 -5

app.py CHANGED Viewed

@@ -2,7 +2,9 @@ import gradio as gr
 import numpy as np
 import io
 from pydub import AudioSegment
 import openai
 from dataclasses import dataclass, field
 from threading import Lock
 import base64
@@ -88,6 +90,7 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
         )
         full_response = ""
         audios = []
         for chunk in stream:
@@ -95,15 +98,19 @@ def generate_response_and_audio(audio_bytes: bytes, state: AppState):
                 continue
             content = chunk.choices[0].delta.content
             audio = getattr(chunk.choices[0], "audio", [])
             if content:
                 full_response += content
-                yield full_response, None, state
             if audio:
                 audios.extend(audio)
         final_audio = b"".join([base64.b64decode(a) for a in audios])
-        yield full_response, final_audio, state
     except Exception as e:
         raise gr.Error(f"Error during audio streaming: {e}")
@@ -126,14 +133,16 @@ def response(state: AppState):
     # Process the generator to get the final results
     final_text = ""
     final_audio = None
-    for text, audio, updated_state in generator:
         final_text = text if text else final_text
         final_audio = audio if audio else final_audio
         state = updated_state
     # Update the chatbot with the final conversation
-    state.conversation.append({"role": "user", "content": "Audio input"})
     state.conversation.append({"role": "assistant", "content": final_text})
     # Reset the audio stream for the next interaction
@@ -218,4 +227,4 @@ with gr.Blocks() as demo:
         cancels=[respond, restart],
     )
-demo.launch(share=True)

 import numpy as np
 import io
 from pydub import AudioSegment
+import tempfile
 import openai
+import time
 from dataclasses import dataclass, field
 from threading import Lock
 import base64
         )
         full_response = ""
+        asr_result = ""
         audios = []
         for chunk in stream:
                 continue
             content = chunk.choices[0].delta.content
             audio = getattr(chunk.choices[0], "audio", [])
+            asr_results = getattr(chunk.choices[0], "asr_results", [])
+            if asr_results:
+                asr_result += "".join(asr_results)
+                yield full_response, asr_result, None, state
             if content:
                 full_response += content
+                yield full_response, asr_result, None, state
             if audio:
                 audios.extend(audio)
         final_audio = b"".join([base64.b64decode(a) for a in audios])
+        yield full_response, asr_result, final_audio, state
     except Exception as e:
         raise gr.Error(f"Error during audio streaming: {e}")
     # Process the generator to get the final results
     final_text = ""
+    final_asr = ""
     final_audio = None
+    for text, asr, audio, updated_state in generator:
         final_text = text if text else final_text
+        final_asr = asr if asr else final_asr
         final_audio = audio if audio else final_audio
         state = updated_state
     # Update the chatbot with the final conversation
+    state.conversation.append({"role": "user", "content": final_asr})
     state.conversation.append({"role": "assistant", "content": final_text})
     # Reset the audio stream for the next interaction
         cancels=[respond, restart],
     )
+demo.launch()