import io
import os

import gradio as gr
import numpy as np
import requests
from pydub import AudioSegment

# Hugging Face Inference API endpoints for ASR, translation, and TTS
ASR_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-speech-recognition-hausa-audio-to-text"
TTS_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/english_voice_tts"
TRANSLATION_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-hausa-text-to-english-text"

# Read the API token from an environment variable (the variable name here is
# arbitrary) instead of hardcoding a secret in the source
headers = {"Authorization": f"Bearer {os.getenv('HF_API_TOKEN', '')}"}

# Query a Hugging Face Inference API endpoint; returns the parsed JSON on
# success, or None if the API reports an error (e.g. the model is still loading)
def query(api_url, payload=None, data=None):
    if data is not None:
        response = requests.post(api_url, headers=headers, data=data)
    else:
        response = requests.post(api_url, headers=headers, json=payload)
    response_json = response.json()
    if isinstance(response_json, dict) and "error" in response_json:
        print(f"Error in query function: {response_json['error']}")
        estimated_time = response_json.get("estimated_time")
        if estimated_time:
            print(f"Estimated time for the model to load: {estimated_time} seconds")
        return None
    return response_json

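# Response shapes assumed below follow the standard Hugging Face pipeline
# outputs (an assumption; not verified against these specific models):
#   ASR:         {"text": "<transcription>"}
#   Translation: [{"translation_text": "<translated text>"}]
#   TTS:         raw audio bytes (e.g. FLAC/WAV) in the response body
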
# Translate Hausa speech to English speech: ASR -> translation -> TTS
def translate_speech(audio_file):
    print(f"Type of audio: {type(audio_file)}, Value of audio: {audio_file}")  # Debug line

    # Use the ASR endpoint to transcribe the audio
    data = audio_file.read()
    output = query(ASR_API_URL, data=data)
    print(f"Output: {output}")  # Debug line
    if output is None:
        return None

    # Check if the 'text' key exists in the output
    if "text" in output:
        transcription = output["text"]
    else:
        print("Key 'text' does not exist in the output.")
        return None

    # Use the translation endpoint to translate the transcription
    translation_output = query(TRANSLATION_API_URL, payload={"inputs": transcription})
    if not translation_output:
        return None
    if isinstance(translation_output, list):
        translated_text = translation_output[0].get("translation_text", "")
    else:
        translated_text = str(translation_output)

    # Use the TTS endpoint to synthesize the translated text; the response body
    # is raw audio bytes
    response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
    audio_bytes = response.content

    # Decode the audio bytes (pydub requires ffmpeg) into the
    # (sample_rate, samples) pair that Gradio's "numpy" audio output expects
    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
    samples = np.array(audio_segment.get_array_of_samples())
    return audio_segment.frame_rate, samples

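# Minimal local smoke test, assuming a Hausa recording named "sample_hausa.wav"
# (a hypothetical file) is present in the working directory:
#
# if __name__ == "__main__":
#     with open("sample_hausa.wav", "rb") as f:
#         result = translate_speech(f)
#     print("Synthesized audio:", "ok" if result else "failed")
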
# print(f"Type of audio: {type(audio_file)}, Value of audio: {audio_file}") # Debug line
# # Use the ASR pipeline to transcribe the audio
# data = audio_file.read()
# output = query(ASR_API_URL, data=data)
# print(f"Output: {output}") # Debug line
# # Check if 'text' key exists in the output
# if 'text' in output:
# transcription = output["text"]
# else:
# print("Key 'text' does not exist in the output.")
# return
# # Use the translation pipeline to translate the transcription
# translated_text = query(TRANSLATION_API_URL, {"inputs": transcription})
# # Use the TTS pipeline to synthesize the translated text
# response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
# audio_bytes = response.content
# # Display the audio output
# return Audio(audio_bytes)
# Define the Gradio interface (uses the legacy gr.inputs/gr.outputs API)
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.inputs.File(type="file"),
    outputs=gr.outputs.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Real-time demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()
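
# To run: set the token (e.g. `export HF_API_TOKEN=hf_...`) and execute the
# script; Gradio serves the demo at http://127.0.0.1:7860 by default.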