Hunzla committed
Commit
7c7805e
1 Parent(s): c5fe8de

Update app.py

Files changed (1)
  1. app.py +42 -50
app.py CHANGED
@@ -1,58 +1,50 @@
  from transformers import pipeline
  import gradio as gr
- from pyannote.core import Annotation
- from pydub import AudioSegment
- import torchaudio
- from pyannote.audio import Pipeline
-
- diarization_pipe = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
-                                             use_auth_token="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")  # change to "your-username/the-name-you-picked"
-
- # Load the speech-to-text model (Whisper)
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
-
- def transcribe_with_diarization(audio_path):
-     # Get speaker segments using the diarization model
-     diarization_result = diarization_pipe(audio_path)
-
-     # Extract speaker segments and transcribe them using Whisper ASR
-     transcripts = []
-     for track, segment, speaker in diarization_result.itertracks(yield_label=True):
-         print(segment)
-         print(speaker)
-         start_time = track.start
-         end_time = track.end
-         print(start_time)
-         print(end_time)
-         label = segment  # Extract the label manually
-         waveform, sample_rate = torchaudio.load(audio_path, normalize=True)
-         start_sample = int(start_time * sample_rate)
-         end_sample = int(end_time * sample_rate)
-         print(waveform)
-         interval_audio = waveform[:, start_sample:end_sample]
-         # Export the interval audio as a temporary WAV file
-         torchaudio.save("interval_audio.wav", interval_audio, sample_rate)
-         transcript = asr_pipe("interval_audio.wav")
-         print(transcript)
-         transcripts.append(transcript)
-
-     # Combine the transcriptions from all speakers
-     text = " ".join(transcripts)
-     return text

  iface = gr.Interface(
-     fn=transcribe_with_diarization,
-     inputs=[
-         gr.File(label="Audio File"),
-         gr.Audio(source="microphone", type="filepath", filetype="mp3")
-     ],
      outputs="text",
-     title="Whisper Large Hindi with Speaker Diarization",
-     description="Real-time demo for Hindi speech recognition using a fine-tuned Whisper large model with speaker diarization.",
  )

- iface.launch()
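Two latent bugs in the removed function are worth noting: in pyannote.audio 2.x, itertracks(yield_label=True) yields (segment, track, label) in that order, so the loop's variable names were swapped, and asr_pipe(...) returns a dict, so the final " ".join(transcripts) would raise a TypeError. A minimal corrected sketch of the same idea (illustrative only, not the committed code; the auth token here is a placeholder):

import torchaudio
from transformers import pipeline
from pyannote.audio import Pipeline

# "YOUR_HF_TOKEN" is a placeholder for a real Hugging Face access token.
diarization_pipe = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                            use_auth_token="YOUR_HF_TOKEN")
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

def transcribe_with_diarization(audio_path):
    diarization = diarization_pipe(audio_path)
    # Load the waveform once, not once per segment.
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)
    transcripts = []
    # itertracks(yield_label=True) yields (segment, track, speaker_label).
    for segment, _track, speaker in diarization.itertracks(yield_label=True):
        start_sample = int(segment.start * sample_rate)
        end_sample = int(segment.end * sample_rate)
        # Export the interval as a temporary WAV file and transcribe it.
        torchaudio.save("interval_audio.wav", waveform[:, start_sample:end_sample], sample_rate)
        # The ASR pipeline returns a dict; keep only the "text" field.
        text = asr_pipe("interval_audio.wav")["text"]
        transcripts.append(f"{speaker}: {text}")
    return " ".join(transcripts)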
 
  from transformers import pipeline
+ asr_pipe = pipeline("automatic-speech-recognition", model="ihanif/whisper-medium-urdu")
+ from difflib import SequenceMatcher
+
+ # List of commands ("Computer, play a game", "Play music", "Dim the lights")
+ commands = [
+     "کمپیوٹر، کھیل کھیلو",
+     "میوزک چلاؤ",
+     "روشنی کم کریں"
+ ]
+ # Reply audio for each command, index-aligned with commands
+ replies = [
+     "https://medicobilling.info/urdu.wav",
+     "download.wav",
+     "https://medicobilling.info/urdu.wav"
+ ]
+
+ # Function to find the most similar command
+ def find_most_similar_command(statement, command_list):
+     best_match = "unknown"
+     reply = "unknown.wav"
+     highest_similarity = 0
+     for i, command in enumerate(command_list):
+         similarity = SequenceMatcher(None, statement, command).ratio()
+         if similarity > highest_similarity:
+             highest_similarity = similarity
+             best_match = command
+             reply = replies[i]
+     return best_match, reply
+
+ def transcribe_the_command(audio_path):
+     transcript = asr_pipe(audio_path)["text"]
+     most_similar_command, reply = find_most_similar_command(transcript, commands)
+     print(f"Given Statement: {transcript}")
+     print(f"Most Similar Command: {most_similar_command}\n")
+     return reply
+
+ # get_text_from_voice("urdu.wav")
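For intuition about the matching step, here is a small, self-contained sketch of how SequenceMatcher.ratio() ranks the commands against a hypothetical, slightly misspelled transcript (the transcript string is my example, not output from the model):

from difflib import SequenceMatcher

commands = ["کمپیوٹر، کھیل کھیلو", "میوزک چلاؤ", "روشنی کم کریں"]
transcript = "میوزک چلاو"  # hypothetical ASR output; only the final letter differs from command 2

for command in commands:
    score = SequenceMatcher(None, transcript, command).ratio()
    print(f"{command}: {score:.2f}")
# Command 2 scores 0.90 (9 of 10 characters match), so its reply "download.wav" would be selected.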
  import gradio as gr
 
 
 
 
  iface = gr.Interface(
+     fn=transcribe_the_command,
+     inputs=gr.Audio(source="microphone", type="filepath"),
      outputs="text",
+     title="Whisper Medium Urdu",
+     description="Realtime demo for Urdu speech recognition using a fine-tuned Whisper medium model.",
  )

+ iface.launch(share=True)
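Since transcribe_the_command returns an audio path rather than display text, a variant of the interface could play the reply directly. A sketch under that assumption (the gr.Audio output and title are my substitutions, not the committed code):

import gradio as gr

# Sketch: play the matched reply instead of returning its path as text.
# Assumes transcribe_the_command and the reply files above are available.
iface = gr.Interface(
    fn=transcribe_the_command,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(type="filepath", label="Reply"),
    title="Urdu Voice Command Demo",
)
iface.launch(share=True)  # share=True requests a temporary public gradio.live URL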