import io
import os

import gradio as gr
import numpy as np
import requests
from pydub import AudioSegment

# Hugging Face Inference API endpoints for ASR, translation, and TTS
ASR_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-speech-recognition-hausa-audio-to-text"
TTS_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/english_voice_tts"
TRANSLATION_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-hausa-text-to-english-text"

# Read the API token from an environment variable (the variable name here is
# arbitrary) instead of hardcoding a secret in the source
headers = {"Authorization": f"Bearer {os.getenv('HF_API_TOKEN', '')}"}

# Query a Hugging Face Inference API endpoint; returns the parsed JSON on
# success, or None if the API reports an error (e.g. the model is still loading)
def query(api_url, payload=None, data=None):
    if data is not None:
        response = requests.post(api_url, headers=headers, data=data)
    else:
        response = requests.post(api_url, headers=headers, json=payload)
    response_json = response.json()
    if isinstance(response_json, dict) and "error" in response_json:
        print(f"Error in query function: {response_json['error']}")
        estimated_time = response_json.get("estimated_time")
        if estimated_time:
            print(f"Estimated time for the model to load: {estimated_time} seconds")
        return None
    return response_json

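# Response shapes assumed below follow the standard Hugging Face pipeline
# outputs (an assumption; not verified against these specific models):
#   ASR:         {"text": "<transcription>"}
#   Translation: [{"translation_text": "<translated text>"}]
#   TTS:         raw audio bytes (e.g. FLAC/WAV) in the response body
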
# Translate Hausa speech to English speech: ASR -> translation -> TTS
def translate_speech(audio_file):
    print(f"Type of audio: {type(audio_file)}, Value of audio: {audio_file}")  # Debug line

    # Use the ASR endpoint to transcribe the audio
    data = audio_file.read()
    output = query(ASR_API_URL, data=data)
    print(f"Output: {output}")  # Debug line
    if output is None:
        return None

    # Check if the 'text' key exists in the output
    if "text" in output:
        transcription = output["text"]
    else:
        print("Key 'text' does not exist in the output.")
        return None

    # Use the translation endpoint to translate the transcription
    translation_output = query(TRANSLATION_API_URL, payload={"inputs": transcription})
    if not translation_output:
        return None
    if isinstance(translation_output, list):
        translated_text = translation_output[0].get("translation_text", "")
    else:
        translated_text = str(translation_output)

    # Use the TTS endpoint to synthesize the translated text; the response body
    # is raw audio bytes
    response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
    audio_bytes = response.content

    # Decode the audio bytes (pydub requires ffmpeg) into the
    # (sample_rate, samples) pair that Gradio's "numpy" audio output expects
    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
    samples = np.array(audio_segment.get_array_of_samples())
    return audio_segment.frame_rate, samples

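# Minimal local smoke test, assuming a Hausa recording named "sample_hausa.wav"
# (a hypothetical file) is present in the working directory:
#
# if __name__ == "__main__":
#     with open("sample_hausa.wav", "rb") as f:
#         result = translate_speech(f)
#     print("Synthesized audio:", "ok" if result else "failed")
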
# print(f"Type of audio: {type(audio_file)}, Value of audio: {audio_file}") # Debug line
# # Use the ASR pipeline to transcribe the audio
# data = audio_file.read()
# output = query(ASR_API_URL, data=data)
# print(f"Output: {output}") # Debug line
# # Check if 'text' key exists in the output
# if 'text' in output:
# transcription = output["text"]
# else:
# print("Key 'text' does not exist in the output.")
# return
# # Use the translation pipeline to translate the transcription
# translated_text = query(TRANSLATION_API_URL, {"inputs": transcription})
# # Use the TTS pipeline to synthesize the translated text
# response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
# audio_bytes = response.content
# # Display the audio output
# return Audio(audio_bytes)
# Define the Gradio interface (uses the legacy gr.inputs/gr.outputs API)
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.inputs.File(type="file"),
    outputs=gr.outputs.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Real-time demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()
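
# To run: set the token (e.g. `export HF_API_TOKEN=hf_...`) and execute the
# script; Gradio serves the demo at http://127.0.0.1:7860 by default.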