import io
import tempfile

import gradio as gr
import numpy as np
import requests
import soundfile as sf
from pydub import AudioSegment
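
# Pipeline: Hausa speech -> Hausa text (ASR) -> English text (translation)
# -> English speech (TTS). Each stage calls the Hugging Face Inference API.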
ASR_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-speech-recognition-hausa-audio-to-text"
TRANSLATION_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/saad-hausa-text-to-english-text"
TTS_API_URL = "https://api-inference.huggingface.co/models/Baghdad99/english_voice_tts"

# The API token is better read from an environment variable than hardcoded.
headers = {"Authorization": "Bearer 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"}

def query(api_url, payload):
    """POST a JSON payload to a Hugging Face Inference API endpoint and return the parsed JSON."""
    response = requests.post(api_url, headers=headers, json=payload)
    return response.json()


def translate_speech(audio):
    """Gradio callback: turn a Hausa microphone recording into spoken English."""
    print(f"Type of audio: {type(audio)}, Value of audio: {audio}")  # debug

    # Gradio's numpy microphone input arrives as a (sample_rate, samples) tuple.
    sample_rate, audio_data = audio
    if isinstance(audio_data, np.ndarray) and len(audio_data.shape) == 1:
        # soundfile expects mono audio as a (frames, channels) array.
        audio_data = np.reshape(audio_data, (-1, 1))

    # Write the recording to a temporary WAV file on disk.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f, audio_data, sample_rate)
        audio_file = f.name
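
    # Re-encode the recording as MP3 before sending it to the ASR endpoint.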
    audio_segment = AudioSegment.from_wav(audio_file)
    mp3_file = audio_file.replace(".wav", ".mp3")
    audio_segment.export(mp3_file, format="mp3")
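
    # 1) Speech recognition: Hausa audio -> Hausa text.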
    with open(mp3_file, "rb") as f:
        data = f.read()
    response = requests.post(ASR_API_URL, headers=headers, data=data)
    output = response.json()

    if "text" in output:
        transcription = output["text"]
    else:
        print("The ASR output does not contain 'text':", output)
        return None
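
    # 2) Translation: Hausa text -> English text.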
    translation_output = query(TRANSLATION_API_URL, {"inputs": transcription})
    # The endpoint returns a list of dicts; unwrap the translated string (the key is
    # "translation_text" or "generated_text" depending on the pipeline) rather than
    # forwarding the raw JSON response to the TTS model.
    first = translation_output[0] if isinstance(translation_output, list) and translation_output else {}
    translated_text = first.get("translation_text") or first.get("generated_text", "")
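
    # 3) Text-to-speech: English text -> English audio (MP3 bytes).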
    response = requests.post(TTS_API_URL, headers=headers, json={"inputs": translated_text})
    audio_bytes = response.content
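
    # Decode the MP3 reply into raw samples for Gradio's audio output.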
    audio_segment = AudioSegment.from_mp3(io.BytesIO(audio_bytes))
    audio_data = np.array(audio_segment.get_array_of_samples())
    if audio_segment.channels == 2:
        audio_data = audio_data.reshape((-1, 2))

    # Gradio's numpy audio output expects a (sample_rate, samples) tuple.
    return audio_segment.frame_rate, audio_data
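
# Build the Gradio UI: record Hausa speech and play back the English translation.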
# gr.inputs / gr.outputs no longer exist in current Gradio releases; pass the
# components directly instead.
iface = gr.Interface(
    fn=translate_speech,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs=gr.Audio(type="numpy"),
    title="Hausa to English Translation",
    description="Realtime demo for Hausa-to-English translation using speech recognition and text-to-speech synthesis.",
)

iface.launch()