import os
import io
import numpy as np
import torch
import gradio as gr
import requests
from gtts import gTTS
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
from groq import Groq

from google.colab import userdata
RAPIDAPI_KEY = userdata.get('RAPIDAPI_LANG_TRANS')
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

# Initialize the Groq client
client = Groq(api_key=GROQ_API_KEY)

# Load the Whisper model
processor = AutoProcessor.from_pretrained("ihanif/whisper-medium-urdu")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ihanif/whisper-medium-urdu")

# Function to translate text using Microsoft Translator
def translate(target, text):
    url = "https://microsoft-translator-text.p.rapidapi.com/translate"
    querystring = {"api-version":"3.0","profanityAction":"NoAction","textType":"plain", "to":target}
    payload = [{"Text": text}]
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "microsoft-translator-text.p.rapidapi.com",
        "Content-Type": "application/json"
    }
    response = requests.post(url, json=payload, headers=headers, params=querystring)
    res = response.json()
    return res[0]["translations"][0]["text"]
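# Minimal usage sketch (hypothetical Urdu text; assumes RAPIDAPI_LANG_TRANS is set
# in Colab secrets):
#   translate("en", "آپ کیسے ہیں؟")  # -> "How are you?"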

# Function to process audio and generate a response
def process_audio(file_path):
    try:
        # Load and preprocess the audio file
        if file_path.endswith(".m4a"):
            audio = AudioSegment.from_file(file_path, format="m4a")
        else:
            audio = AudioSegment.from_file(file_path)
        
        audio = audio.set_frame_rate(16000)  # Whisper requires a 16kHz sample rate
        audio = audio.set_channels(1)  # Whisper expects mono audio
        
        # Convert audio to numpy array for processing
        audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0  # Normalize to [-1, 1] range
        
        # Extract log-mel input features; the processor pads/truncates the clip to
        # Whisper's fixed 30-second window, so no manual attention mask is needed here
        audio_input = processor(audio_samples, return_tensors="pt", sampling_rate=16000)

        # Transcribe the audio using the fine-tuned Whisper model
        with torch.no_grad():
            result = model.generate(audio_input.input_features)
        text = processor.batch_decode(result, skip_special_tokens=True)[0]
        
        if not text.strip():  # Check if the transcribed text is empty
            return "No speech detected in the audio file.", None
        
        print(f"Transcribed Text (Urdu): {text}")  # Debugging step

        # Translate the transcribed Urdu text to English
        urdu_to_eng = translate("en", text)
        print(f"Translated Text (English): {urdu_to_eng}")  # Debugging step

        # Generate a response using Groq
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": urdu_to_eng}],
            model="llama3-8b-8192",  # Ensure the model supports Urdu if possible
            max_tokens=50
        )

        # Access the response using dot notation
        response_message = chat_completion.choices[0].message.content.strip()
        print(f"Groq Response (English): {response_message}")  # Debugging step

        # Translate the response text back to Urdu
        eng_to_urdu = translate("ur", response_message)
        print(f"Translated Response (Urdu): {eng_to_urdu}")  # Debugging step

        # Convert the response text to Urdu speech
        tts = gTTS(text=eng_to_urdu, lang="ur")
        response_audio_io = io.BytesIO()
        tts.write_to_fp(response_audio_io)  # Save the audio to the BytesIO object
        response_audio_io.seek(0)

        # Save audio to a file to ensure it's generated correctly
        with open("response.mp3", "wb") as audio_file:
            audio_file.write(response_audio_io.getvalue())

        # Return the response text and the path to the saved audio file
        return eng_to_urdu, "response.mp3"

    except Exception as e:
        return f"An error occurred: {e}", None

# Gradio interface to handle the audio input and output
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # Use type="filepath"
    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
    live=True
)

iface.launch(share=True)
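
# Minimal sketch for testing the pipeline without the Gradio UI (run in place of
# iface.launch above; "sample_urdu.m4a" is a hypothetical local recording):
#   reply_text, reply_audio = process_audio("sample_urdu.m4a")
#   print(reply_text, reply_audio)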