Update app.py
Browse files
app.py
CHANGED
@@ -12,13 +12,15 @@ pipe = pipeline(
|
|
12 |
translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
|
13 |
tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
|
14 |
|
15 |
-
# Define the function to translate speech
|
16 |
def translate_speech(audio_data_tuple):
|
17 |
print(f"Type of audio: {type(audio_data_tuple)}, Value of audio: {audio_data_tuple}") # Debug line
|
18 |
|
19 |
# Extract the audio data from the tuple
|
20 |
sample_rate, audio_data = audio_data_tuple
|
21 |
|
|
|
|
|
|
|
22 |
# Use the speech recognition pipeline to transcribe the audio
|
23 |
output = pipe(audio_data)
|
24 |
print(f"Output: {output}") # Print the output to see what it contains
|
@@ -30,6 +32,9 @@ def translate_speech(audio_data_tuple):
|
|
30 |
print("The output does not contain 'text'")
|
31 |
return
|
32 |
|
|
|
|
|
|
|
33 |
# Use the translation pipeline to translate the transcription
|
34 |
translated_text = translator(transcription, return_tensors="pt")
|
35 |
print(f"Translated text: {translated_text}") # Print the translated text to see what it contains
|
@@ -42,6 +47,9 @@ def translate_speech(audio_data_tuple):
|
|
42 |
print("The translated text does not contain 'generated_token_ids'")
|
43 |
return
|
44 |
|
|
|
|
|
|
|
45 |
# Use the text-to-speech pipeline to synthesize the translated text
|
46 |
synthesised_speech = tts(translated_text_str)
|
47 |
print(f"Synthesised speech: {synthesised_speech}") # Print the synthesised speech to see what it contains
|
@@ -56,11 +64,15 @@ def translate_speech(audio_data_tuple):
|
|
56 |
# Flatten the audio data
|
57 |
synthesised_speech_data = synthesised_speech_data.flatten()
|
58 |
|
|
|
|
|
|
|
59 |
# Scale the audio data to the range of int16 format
|
60 |
synthesised_speech = (synthesised_speech_data * 32767).astype(np.int16)
|
61 |
|
62 |
return 16000, synthesised_speech
|
63 |
|
|
|
64 |
# Define the Gradio interface
|
65 |
iface = gr.Interface(
|
66 |
fn=translate_speech,
|
|
|
12 |
translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
|
13 |
tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
|
14 |
|
|
|
15 |
def translate_speech(audio_data_tuple):
|
16 |
print(f"Type of audio: {type(audio_data_tuple)}, Value of audio: {audio_data_tuple}") # Debug line
|
17 |
|
18 |
# Extract the audio data from the tuple
|
19 |
sample_rate, audio_data = audio_data_tuple
|
20 |
|
21 |
+
# Print the shape and type of the audio data
|
22 |
+
print(f"Audio data type: {type(audio_data)}, Audio data shape: {audio_data.shape}")
|
23 |
+
|
24 |
# Use the speech recognition pipeline to transcribe the audio
|
25 |
output = pipe(audio_data)
|
26 |
print(f"Output: {output}") # Print the output to see what it contains
|
|
|
32 |
print("The output does not contain 'text'")
|
33 |
return
|
34 |
|
35 |
+
# Print the transcription
|
36 |
+
print(f"Transcription: {transcription}")
|
37 |
+
|
38 |
# Use the translation pipeline to translate the transcription
|
39 |
translated_text = translator(transcription, return_tensors="pt")
|
40 |
print(f"Translated text: {translated_text}") # Print the translated text to see what it contains
|
|
|
47 |
print("The translated text does not contain 'generated_token_ids'")
|
48 |
return
|
49 |
|
50 |
+
# Print the translated text string
|
51 |
+
print(f"Translated text string: {translated_text_str}")
|
52 |
+
|
53 |
# Use the text-to-speech pipeline to synthesize the translated text
|
54 |
synthesised_speech = tts(translated_text_str)
|
55 |
print(f"Synthesised speech: {synthesised_speech}") # Print the synthesised speech to see what it contains
|
|
|
64 |
# Flatten the audio data
|
65 |
synthesised_speech_data = synthesised_speech_data.flatten()
|
66 |
|
67 |
+
# Print the shape and type of the synthesised speech data
|
68 |
+
print(f"Synthesised speech data type: {type(synthesised_speech_data)}, Synthesised speech data shape: {synthesised_speech_data.shape}")
|
69 |
+
|
70 |
# Scale the audio data to the range of int16 format
|
71 |
synthesised_speech = (synthesised_speech_data * 32767).astype(np.int16)
|
72 |
|
73 |
return 16000, synthesised_speech
|
74 |
|
75 |
+
|
76 |
# Define the Gradio interface
|
77 |
iface = gr.Interface(
|
78 |
fn=translate_speech,
|