Adeenakk committed
Commit
bab0937
1 Parent(s): 2d5360e

Create app.py

Files changed (1): app.py (+103, -0)
app.py ADDED
import os
import io
import numpy as np
import gradio as gr
import requests
from gtts import gTTS
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
from groq import Groq

# Read the API keys from environment variables (on Hugging Face Spaces,
# repository secrets are exposed to the app this way)
RAPIDAPI_KEY = os.environ.get('RAPIDAPI_LANG_TRANS')
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')

# Initialize the Groq client
client = Groq(api_key=GROQ_API_KEY)

# Load the Urdu fine-tuned Whisper model and its processor
processor = AutoProcessor.from_pretrained("ihanif/whisper-medium-urdu")
model = AutoModelForSpeechSeq2Seq.from_pretrained("ihanif/whisper-medium-urdu")

# Translate text using the Microsoft Translator endpoint on RapidAPI
def translate(target, text):
    url = "https://microsoft-translator-text.p.rapidapi.com/translate"
    querystring = {
        "api-version": "3.0",
        "profanityAction": "NoAction",
        "textType": "plain",
        "to": target,
    }
    payload = [{"Text": text}]
    headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": "microsoft-translator-text.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    response = requests.post(url, json=payload, headers=headers, params=querystring)
    res = response.json()
    return res[0]["translations"][0]["text"]
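
# For reference, a successful response from this endpoint is a list shaped like
#   [{"translations": [{"text": "...", "to": "en"}]}]
# which is what the indexing above assumes.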

# Transcribe Urdu speech, generate a chat response, and answer in spoken Urdu
def process_audio(file_path):
    try:
        # Load the audio file; pydub infers the format (m4a, wav, ...) from the extension
        audio = AudioSegment.from_file(file_path)

        # Whisper expects 16 kHz mono input
        audio = audio.set_frame_rate(16000)
        audio = audio.set_channels(1)
        audio = audio.set_sample_width(2)  # force 16-bit samples so the /32768 normalization below is valid

        # Convert the audio to a float32 numpy array in the [-1, 1] range
        audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
        audio_input = processor(audio_samples, sampling_rate=16000, return_tensors="pt")

        # Transcribe the audio using the fine-tuned Whisper model
        result = model.generate(**audio_input)
        text = processor.batch_decode(result, skip_special_tokens=True)[0]

        if not text.strip():  # nothing was transcribed
            return "No speech detected in the audio file.", None
        print(f"Transcribed Text (Urdu): {text}")  # debugging

        # Translate the transcribed Urdu text to English
        urdu_to_eng = translate("en", text)
        print(f"Translated Text (English): {urdu_to_eng}")  # debugging

        # Generate a response using Groq (the chat itself happens in English;
        # translation handles the Urdu on both ends)
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": urdu_to_eng}],
            model="llama3-8b-8192",
            max_tokens=50,
        )
        response_message = chat_completion.choices[0].message.content.strip()
        print(f"Groq Response (English): {response_message}")  # debugging

        # Translate the response text back to Urdu
        eng_to_urdu = translate("ur", response_message)
        print(f"Translated Response (Urdu): {eng_to_urdu}")  # debugging

        # Convert the Urdu response text to speech
        tts = gTTS(text=eng_to_urdu, lang="ur")
        response_audio_io = io.BytesIO()
        tts.write_to_fp(response_audio_io)  # render the MP3 into the BytesIO object
        response_audio_io.seek(0)

        # Persist the audio to disk so Gradio can serve it by path
        with open("response.mp3", "wb") as audio_file:
            audio_file.write(response_audio_io.getvalue())

        # Return the response text and the path to the saved audio file
        return eng_to_urdu, "response.mp3"

    except Exception as e:
        return f"An error occurred: {e}", None

# Gradio interface wiring the audio input to the text and audio outputs
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath"),  # hand the recording to process_audio as a file path
    outputs=[gr.Textbox(label="Response Text"), gr.Audio(label="Response Audio")],
    live=True,
)

# Guard the launch so the module can be imported without starting the server
if __name__ == "__main__":
    iface.launch()
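
For local testing, the pipeline can be exercised without the Gradio UI. A minimal sketch, assuming the file above is saved as app.py, both keys are set in the environment, and a sample recording sample.m4a (a hypothetical file) exists on disk:

import os
os.environ["RAPIDAPI_LANG_TRANS"] = "<your-rapidapi-key>"  # hypothetical placeholder
os.environ["GROQ_API_KEY"] = "<your-groq-key>"             # hypothetical placeholder

from app import process_audio  # import is side-effect free thanks to the main guard

reply_text, reply_audio_path = process_audio("sample.m4a")  # hypothetical input file
print(reply_text)        # Urdu response text
print(reply_audio_path)  # "response.mp3"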