# urduvoice / app.py — Urdu voice assistant (Gradio app, Hugging Face Space
# by Adeenakk, commit bab0937). Pipeline: Urdu speech -> Whisper transcription
# -> English translation -> Groq LLM reply -> Urdu translation -> gTTS speech.
import os
import io
import numpy as np
import gradio as gr
import requests
from gtts import gTTS
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
from groq import Groq
# --- Configuration and model setup ---

# Read API keys from environment variables. The original code called
# `userdata.get(...)` — a Google Colab helper that is never imported here,
# so it raised NameError outside Colab; os.environ works everywhere.
RAPIDAPI_KEY = os.environ.get("RAPIDAPI_LANG_TRANS")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Initialize the Groq client used for chat completions.
client = Groq(api_key=GROQ_API_KEY)

# Load the Urdu fine-tuned Whisper checkpoint for speech-to-text.
processor = AutoProcessor.from_pretrained("ihanif/whisper-medium-urdu")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ihanif/whisper-medium-urdu")
# Function to translate text using Microsoft Translator
def translate(target, text):
    """Translate *text* into language *target* via Microsoft Translator on RapidAPI.

    Parameters
    ----------
    target : str
        BCP-47 language code to translate into (e.g. "en", "ur").
    text : str
        Source text to translate.

    Returns
    -------
    str
        The translated text.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a non-2xx HTTP response.
    """
    url = "https://microsoft-translator-text.p.rapidapi.com/translate"
    querystring = {
        "api-version": "3.0",
        "profanityAction": "NoAction",
        "textType": "plain",
        "to": target,
    }
    payload = [{"Text": text}]
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "microsoft-translator-text.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    # The original call had no timeout, so a stalled connection would hang
    # the whole request handler indefinitely.
    response = requests.post(
        url, json=payload, headers=headers, params=querystring, timeout=30
    )
    # Surface HTTP errors explicitly instead of letting a 4xx/5xx body
    # produce a confusing KeyError when indexing the JSON below.
    response.raise_for_status()
    res = response.json()
    return res[0]["translations"][0]["text"]
# Function to process audio and generate a response
def process_audio(file_path):
    """Run the full voice-assistant pipeline for one uploaded audio file.

    Steps: decode audio -> Whisper transcription (Urdu) -> translate to
    English -> Groq chat completion -> translate the reply back to Urdu ->
    synthesize Urdu speech with gTTS.

    Parameters
    ----------
    file_path : str
        Path to the uploaded audio file (any format pydub/ffmpeg can decode).

    Returns
    -------
    tuple[str, str | None]
        (Urdu response text, path to the generated MP3), or an error
        message and ``None`` on failure.
    """
    try:
        # Decode the audio; m4a needs the explicit format hint for pydub.
        if file_path.endswith(".m4a"):
            audio = AudioSegment.from_file(file_path, format="m4a")
        else:
            audio = AudioSegment.from_file(file_path)

        # Whisper expects 16 kHz mono. Also force 16-bit samples so the
        # 32768 normalization below is correct for any source bit depth
        # (the original silently assumed 16-bit input).
        audio = audio.set_frame_rate(16000)
        audio = audio.set_channels(1)
        audio = audio.set_sample_width(2)

        # Normalize int16 PCM to float32 in the [-1, 1] range.
        audio_samples = (
            np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
        )
        audio_input = processor(
            audio_samples, return_tensors="pt", sampling_rate=16000
        )

        # Transcribe with the fine-tuned Urdu Whisper model.
        result = model.generate(**audio_input)
        text = processor.batch_decode(result, skip_special_tokens=True)[0]
        if not text.strip():  # nothing recognizable in the recording
            return "No speech detected in the audio file.", None
        print(f"Transcribed Text (Urdu): {text}")  # Debugging step

        # Urdu -> English so the LLM prompt is in English.
        urdu_to_eng = translate("en", text)
        print(f"Translated Text (English): {urdu_to_eng}")  # Debugging step

        # Generate a short reply with the Groq-hosted LLM.
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": urdu_to_eng}],
            model="llama3-8b-8192",
            max_tokens=50,
        )
        response_message = chat_completion.choices[0].message.content.strip()
        print(f"Groq Response (English): {response_message}")  # Debugging step

        # English -> Urdu for the spoken answer.
        eng_to_urdu = translate("ur", response_message)
        print(f"Translated Response (Urdu): {eng_to_urdu}")  # Debugging step

        # Synthesize Urdu speech and persist it where Gradio can serve it.
        tts = gTTS(text=eng_to_urdu, lang="ur")
        response_audio_io = io.BytesIO()
        tts.write_to_fp(response_audio_io)
        response_audio_io.seek(0)
        with open("response.mp3", "wb") as audio_file:
            audio_file.write(response_audio_io.getvalue())

        return eng_to_urdu, "response.mp3"
    except Exception as e:
        # Top-level boundary for the Gradio callback: report the error in
        # the UI instead of crashing the app.
        return f"An error occurred: {e}", None
# Gradio interface to handle the audio input and output
# Gradio UI: accept recorded/uploaded audio and show the Urdu reply as
# both text and synthesized speech. NOTE(review): live=True re-triggers
# the (slow) pipeline on every input change — confirm that is intended.
iface = gr.Interface(
fn=process_audio,
inputs=gr.Audio(type="filepath"),  # callback receives a file path, not raw samples
outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
live=True
)
iface.launch()