import io

import numpy as np
import gradio as gr
import requests
from gtts import gTTS
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
from groq import Groq
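
# Note: pydub relies on ffmpeg for m4a/mp3 decoding; Colab ships with it
# preinstalled, but other environments may need to install it separately.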

# API keys are stored as Colab secrets
from google.colab import userdata

RAPIDAPI_KEY = userdata.get('RAPIDAPI_LANG_TRANS')
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

# Groq client for LLM chat completions
client = Groq(api_key=GROQ_API_KEY)

# Urdu fine-tuned Whisper checkpoint for speech-to-text
processor = AutoProcessor.from_pretrained("ihanif/whisper-medium-urdu")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ihanif/whisper-medium-urdu")
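
# whisper-medium is a ~770M-parameter model; on a GPU runtime, model.to("cuda")
# (with the input features moved likewise) makes transcription much faster.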

def translate(target, text):
    """Translate text into the target language code (e.g. "en", "ur")
    via the Microsoft Translator API on RapidAPI."""
    url = "https://microsoft-translator-text.p.rapidapi.com/translate"
    querystring = {
        "api-version": "3.0",
        "profanityAction": "NoAction",
        "textType": "plain",
        "to": target,
    }
    payload = [{"Text": text}]
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "microsoft-translator-text.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    response = requests.post(url, json=payload, headers=headers, params=querystring)
    response.raise_for_status()  # fail loudly on HTTP errors instead of a KeyError below
    return response.json()[0]["translations"][0]["text"]
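
# Hypothetical example (assumes a valid RapidAPI key and quota):
#   translate("en", "آپ کیسے ہیں؟")  ->  "How are you?"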

def process_audio(file_path):
    try:
        # Load the recording; pydub infers most formats, but m4a needs an explicit hint
        if file_path.endswith(".m4a"):
            audio = AudioSegment.from_file(file_path, format="m4a")
        else:
            audio = AudioSegment.from_file(file_path)

        # Whisper expects 16 kHz mono input
        audio = audio.set_frame_rate(16000)
        audio = audio.set_channels(1)

        # Scale 16-bit PCM samples to float32 in [-1.0, 1.0]
        audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0

        # Compute log-mel input features for the model
        audio_input = processor(audio_samples, return_tensors="pt", sampling_rate=16000)

        # Generate the transcription; Whisper's processor already pads the
        # features to a fixed length, so no explicit attention mask is needed
        result = model.generate(audio_input["input_features"])
        text = processor.batch_decode(result, skip_special_tokens=True)[0]
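
        # If decoding ever drifts into the wrong language, newer transformers
        # releases accept a language hint (untested assumption for this checkpoint):
        #   result = model.generate(audio_input["input_features"], language="urdu")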

        if not text.strip():
            return "No speech detected in the audio file.", None

        print(f"Transcribed Text (Urdu): {text}")

        # Translate the Urdu transcription to English for the LLM
        urdu_to_eng = translate("en", text)
        print(f"Translated Text (English): {urdu_to_eng}")

        # Ask Groq's Llama 3 8B model for a short English reply
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": urdu_to_eng}],
            model="llama3-8b-8192",
            max_tokens=50,
        )

        response_message = chat_completion.choices[0].message.content.strip()
        print(f"Groq Response (English): {response_message}")
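
        # To steer tone or length, a system message could be prepended to the
        # messages list above, e.g. {"role": "system", "content": "Reply briefly."}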

        # Translate the reply back to Urdu
        eng_to_urdu = translate("ur", response_message)
        print(f"Translated Response (Urdu): {eng_to_urdu}")

        # Synthesize Urdu speech for the reply
        tts = gTTS(text=eng_to_urdu, lang="ur")
        response_audio_io = io.BytesIO()
        tts.write_to_fp(response_audio_io)
        response_audio_io.seek(0)
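
        # gTTS can also write straight to disk with tts.save("response.mp3");
        # the in-memory buffer is only needed if the raw bytes are reused elsewhere.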

        # Persist the MP3 so Gradio can serve it by file path
        with open("response.mp3", "wb") as audio_file:
            audio_file.write(response_audio_io.getvalue())

        return eng_to_urdu, "response.mp3"

    except Exception as e:
        return f"An error occurred: {e}", None

# Gradio UI: record or upload audio, get the Urdu reply as text and speech
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
    live=True,  # re-runs on every input change; the default submit button avoids repeated API calls
)

# share=True prints a temporary public URL, needed because the Colab VM's
# localhost is not directly reachable
iface.launch(share=True)