Adeenakk committed
Commit
bab0937
1 Parent(s): 2d5360e

Create app.py

Files changed (1): app.py (+103, -0)
app.py ADDED
import os
import io
import numpy as np
import gradio as gr
import requests
from gtts import gTTS
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
from groq import Groq

# Read the API keys from environment variables (on Hugging Face Spaces,
# repository secrets are exposed to the app this way)
RAPIDAPI_KEY = os.environ.get('RAPIDAPI_LANG_TRANS')
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')

# Initialize the Groq client
client = Groq(api_key=GROQ_API_KEY)

# Load the Urdu fine-tuned Whisper model and its processor
processor = AutoProcessor.from_pretrained("ihanif/whisper-medium-urdu")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ihanif/whisper-medium-urdu")

# Translate text using the Microsoft Translator endpoint on RapidAPI
def translate(target, text):
    url = "https://microsoft-translator-text.p.rapidapi.com/translate"
    querystring = {
        "api-version": "3.0",
        "profanityAction": "NoAction",
        "textType": "plain",
        "to": target,
    }
    payload = [{"Text": text}]
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "microsoft-translator-text.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    response = requests.post(url, json=payload, headers=headers, params=querystring)
    res = response.json()
    return res[0]["translations"][0]["text"]
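
# For reference, a successful response from this endpoint is a list shaped like
#   [{"translations": [{"text": "...", "to": "en"}]}]
# which is what the indexing above assumes.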

# Transcribe Urdu speech, generate a chat response, and answer in spoken Urdu
def process_audio(file_path):
    try:
        # Load the audio file; pydub infers the format (m4a, wav, ...) from the extension
        audio = AudioSegment.from_file(file_path)

        # Whisper expects 16 kHz mono input
        audio = audio.set_frame_rate(16000)
        audio = audio.set_channels(1)
        audio = audio.set_sample_width(2)  # force 16-bit samples so the /32768 normalization below is valid

        # Convert the audio to a float32 numpy array in the [-1, 1] range
        audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
        audio_input = processor(audio_samples, sampling_rate=16000, return_tensors="pt")

        # Transcribe the audio using the fine-tuned Whisper model
        result = model.generate(**audio_input)
        text = processor.batch_decode(result, skip_special_tokens=True)[0]

        if not text.strip():  # nothing was transcribed
            return "No speech detected in the audio file.", None
        print(f"Transcribed Text (Urdu): {text}")  # debugging

        # Translate the transcribed Urdu text to English
        urdu_to_eng = translate("en", text)
        print(f"Translated Text (English): {urdu_to_eng}")  # debugging

        # Generate a response using Groq (the chat itself happens in English;
        # translation handles the Urdu on both ends)
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": urdu_to_eng}],
            model="llama3-8b-8192",
            max_tokens=50,
        )
        response_message = chat_completion.choices[0].message.content.strip()
        print(f"Groq Response (English): {response_message}")  # debugging

        # Translate the response text back to Urdu
        eng_to_urdu = translate("ur", response_message)
        print(f"Translated Response (Urdu): {eng_to_urdu}")  # debugging

        # Convert the Urdu response text to speech
        tts = gTTS(text=eng_to_urdu, lang="ur")
        response_audio_io = io.BytesIO()
        tts.write_to_fp(response_audio_io)  # render the MP3 into the BytesIO object
        response_audio_io.seek(0)

        # Persist the audio to disk so Gradio can serve it by path
        with open("response.mp3", "wb") as audio_file:
            audio_file.write(response_audio_io.getvalue())

        # Return the response text and the path to the saved audio file
        return eng_to_urdu, "response.mp3"

    except Exception as e:
        return f"An error occurred: {e}", None

# Gradio interface wiring the audio input to the text and audio outputs
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # hand the recording to process_audio as a file path
    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
    live=True,
)

# Guard the launch so the module can be imported without starting the server
if __name__ == "__main__":
    iface.launch()
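
For local testing, the pipeline can be exercised without the Gradio UI. A minimal sketch, assuming the file above is saved as app.py, both keys are set in the environment, and a sample recording sample.m4a (a hypothetical file) exists on disk:

import os
os.environ["RAPIDAPI_LANG_TRANS"] = "<your-rapidapi-key>"  # hypothetical placeholder
os.environ["GROQ_API_KEY"] = "<your-groq-key>"             # hypothetical placeholder

from app import process_audio  # import is side-effect free thanks to the main guard

reply_text, reply_audio_path = process_audio("sample.m4a")  # hypothetical input file
print(reply_text)        # Urdu response text
print(reply_audio_path)  # "response.mp3"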