"""Urdu voice assistant pipeline (Colab):

audio file -> Whisper (Urdu) transcription -> translate to English ->
Groq LLM response -> translate back to Urdu -> gTTS speech, served via Gradio.
"""
import io
import os

import numpy as np
import gradio as gr
import requests
from gtts import gTTS
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
from groq import Groq
from google.colab import userdata

# API credentials come from Colab's secret store.
RAPIDAPI_KEY = userdata.get('RAPIDAPI_LANG_TRANS')
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

# Initialize the Groq client once at import time.
client = Groq(api_key=GROQ_API_KEY)

# Load the Urdu-fine-tuned Whisper model once; reused for every request.
processor = AutoProcessor.from_pretrained("ihanif/whisper-medium-urdu")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ihanif/whisper-medium-urdu")


def translate(target, text):
    """Translate ``text`` into the language ``target`` (BCP-47 code, e.g. "en", "ur")
    using the Microsoft Translator endpoint on RapidAPI.

    Returns the translated string.
    Raises ``requests.HTTPError`` on a non-2xx response (clearer than the
    KeyError that would otherwise surface from an error payload).
    """
    url = "https://microsoft-translator-text.p.rapidapi.com/translate"
    querystring = {
        "api-version": "3.0",
        "profanityAction": "NoAction",
        "textType": "plain",
        "to": target,
    }
    payload = [{"Text": text}]
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "microsoft-translator-text.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    response = requests.post(url, json=payload, headers=headers, params=querystring)
    response.raise_for_status()  # fail loudly instead of KeyError on an error body
    res = response.json()
    return res[0]["translations"][0]["text"]


def process_audio(file_path):
    """Full round trip for one voice message.

    Parameters:
        file_path: path to the recorded audio file (from gr.Audio), or None
            when Gradio fires with no recording yet.

    Returns:
        (response_text_urdu, path_to_response_mp3) on success, or
        (error_message, None) on failure / empty input.
    """
    try:
        # Gradio with live=True can invoke the callback before any audio exists.
        if file_path is None:
            return "No audio received.", None

        # Load the audio; pydub auto-detects most containers, but .m4a is
        # named explicitly to help ffmpeg pick the right demuxer.
        if file_path.endswith(".m4a"):
            audio = AudioSegment.from_file(file_path, format="m4a")
        else:
            audio = AudioSegment.from_file(file_path)

        audio = audio.set_frame_rate(16000)  # Whisper requires 16 kHz input
        audio = audio.set_channels(1)        # Whisper expects mono audio

        # Convert to float32 samples normalized to [-1, 1] (16-bit PCM scale).
        audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0

        # The processor pads/truncates to Whisper's fixed 30 s log-mel input.
        # NOTE: the previous version built a hand-made attention mask over the
        # raw waveform and wrapped it with `torch.tensor(...)` — but `torch`
        # was never imported, so every call raised NameError (swallowed by the
        # except below), and the mask's shape would not have matched the
        # padded mel features anyway. For a single un-batched utterance no
        # attention mask is needed, so it is simply omitted.
        audio_input = processor(audio_samples, return_tensors="pt", sampling_rate=16000)

        # Transcribe with the fine-tuned Whisper model.
        predicted_ids = model.generate(**audio_input)
        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        if not text.strip():  # nothing recognisable in the recording
            return "No speech detected in the audio file.", None
        print(f"Transcribed Text (Urdu): {text}")  # Debugging step

        # Translate the transcribed Urdu text to English for the LLM.
        urdu_to_eng = translate("en", text)
        print(f"Translated Text (English): {urdu_to_eng}")  # Debugging step

        # Generate a short response with Groq.
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": urdu_to_eng}],
            model="llama3-8b-8192",
            max_tokens=50,
        )
        response_message = chat_completion.choices[0].message.content.strip()
        print(f"Groq Response (English): {response_message}")  # Debugging step

        # Translate the response back to Urdu.
        eng_to_urdu = translate("ur", response_message)
        print(f"Translated Response (Urdu): {eng_to_urdu}")  # Debugging step

        # Synthesize Urdu speech and persist it so Gradio can serve the file.
        tts = gTTS(text=eng_to_urdu, lang="ur")
        response_audio_io = io.BytesIO()
        tts.write_to_fp(response_audio_io)
        response_audio_io.seek(0)
        with open("response.mp3", "wb") as audio_file:
            audio_file.write(response_audio_io.getvalue())

        return eng_to_urdu, "response.mp3"

    except Exception as e:
        # Top-level boundary for the Gradio callback: report instead of crash.
        return f"An error occurred: {e}", None


# Gradio interface: audio in, (text, audio) out.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Response Text"),
        gr.Audio(label="Response Audio"),
    ],
    live=True,
)
iface.launch(share=True)