import os
import io
import numpy as np
import gradio as gr
import requests
from gtts import gTTS
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment

# Load environment variables for API keys
RAPIDAPI_KEY = os.getenv('RAPIDAPI_LANG_TRANS')
GROQ_API_KEY = os.getenv('GROQ_API_KEY')

# Load the fine-tuned Whisper model and its processor
processor = AutoProcessor.from_pretrained("ihanif/whisper-medium-urdu")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ihanif/whisper-medium-urdu")

# Function to translate text using Microsoft Translator (via RapidAPI)
def translate(target, text):
    url = "https://microsoft-translator-text.p.rapidapi.com/translate"
    querystring = {"api-version": "3.0", "profanityAction": "NoAction", "textType": "plain", "to": target}
    payload = [{"Text": text}]
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "microsoft-translator-text.p.rapidapi.com",
        "Content-Type": "application/json"
    }
    response = requests.post(url, json=payload, headers=headers, params=querystring)
    res = response.json()
    return res[0]["translations"][0]["text"]

# Function to process audio and generate a response
def process_audio(file_path):
    try:
        # Load and preprocess the audio file
        if file_path.endswith(".m4a"):
            audio = AudioSegment.from_file(file_path, format="m4a")
        else:
            audio = AudioSegment.from_file(file_path)
        audio = audio.set_frame_rate(16000)  # Whisper requires a 16kHz sample rate
        audio = audio.set_channels(1)        # Whisper expects mono audio
        audio = audio.set_sample_width(2)    # Force 16-bit samples so the normalization below is valid

        # Convert audio to a numpy array and normalize int16 samples to the [-1, 1] range
        audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
        audio_input = processor(audio_samples, return_tensors="pt", sampling_rate=16000)

        # Transcribe the audio using the fine-tuned Whisper model
        result = model.generate(**audio_input)
        text = processor.batch_decode(result, skip_special_tokens=True)[0]

        if not text.strip():  # Check if the transcribed text is empty
            return "No speech detected in the audio file.", None
        print(f"Transcribed Text (Urdu): {text}")  # Debugging step

        # Translate the transcribed Urdu text to English
        urdu_to_eng = translate("en", text)
        print(f"Translated Text (English): {urdu_to_eng}")  # Debugging step

        # Make API call to Groq (OpenAI-compatible chat completions endpoint)
        groq_url = "https://api.groq.com/openai/v1/chat/completions"
        groq_headers = {
            "Authorization": f"Bearer {GROQ_API_KEY}",
            "Content-Type": "application/json"
        }
        groq_payload = {
            "messages": [{"role": "user", "content": urdu_to_eng}],
            "model": "llama3-8b-8192",  # Adjust model if needed
            "max_tokens": 50
        }
        response = requests.post(groq_url, json=groq_payload, headers=groq_headers)
        chat_completion = response.json()

        # Access the response
        response_message = chat_completion["choices"][0]["message"]["content"].strip()
        print(f"Groq Response (English): {response_message}")  # Debugging step

        # Translate the response text back to Urdu
        eng_to_urdu = translate("ur", response_message)
        print(f"Translated Response (Urdu): {eng_to_urdu}")  # Debugging step

        # Convert the response text to Urdu speech
        tts = gTTS(text=eng_to_urdu, lang="ur")
        response_audio_io = io.BytesIO()
        tts.write_to_fp(response_audio_io)  # Write the audio into the BytesIO object
        response_audio_io.seek(0)

        # Save the audio to a file to ensure it's generated correctly
        with open("response.mp3", "wb") as audio_file:
            audio_file.write(response_audio_io.getvalue())

        # Return the response text and the path to the saved audio file
        return eng_to_urdu, "response.mp3"

    except Exception as e:
        return f"An error occurred: {e}", None
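
# A minimal sketch of how the translate() helper can be exercised on its own,
# assuming the RAPIDAPI_LANG_TRANS key is set; the sample sentence is purely
# illustrative. Uncomment to verify the RapidAPI credentials before running
# the full speech pipeline.
# print(translate("ur", "How are you?"))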

# Gradio interface to handle the audio input and output
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # Pass the recording to process_audio as a file path
    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
    live=True
)

iface.launch()
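
# Optional local smoke test (a sketch; "sample_urdu.m4a" is a hypothetical path,
# replace it with a real recording). To use it, comment out iface.launch() above
# and uncomment the two lines below.
# response_text, response_audio = process_audio("sample_urdu.m4a")
# print(response_text, response_audio)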