Hunzla committed
Commit c009378
1 Parent(s): bec296b

Update app.py

Files changed (1)
  1. app.py +51 -9
app.py CHANGED
@@ -1,18 +1,60 @@
  from transformers import pipeline
  import gradio as gr

- pipe = pipeline(model="Hunzla/whisper-small-hi")  # change to "your-username/the-name-you-picked"

- def transcribe(audio):
-     text = pipe(audio)["text"]
-     return text

  iface = gr.Interface(
-     fn=transcribe,
-     inputs=gr.Audio(source="microphone", type="filepath"),
      outputs="text",
-     title="Whisper Small Hindi",
-     description="Realtime demo for Hindi speech recognition using a fine-tuned Whisper small model.",
  )

- iface.launch()
 
  from transformers import pipeline
  import gradio as gr
+ from pyannote.audio import Pipeline
+ import torchaudio

+ # Load the speaker diarization model (requires a Hugging Face access token;
+ # better to read it from an environment variable than to hardcode it)
+ diarization_pipe = Pipeline.from_pretrained(
+     "pyannote/speaker-diarization@2.1",
+     use_auth_token="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",  # change to your own token
+ )

+ # Load the speech-to-text model (Whisper fine-tuned for Hindi)
+ asr_pipe = pipeline("automatic-speech-recognition", model="SyedAunZaidi/whisper-small-hi")

+ def transcribe_with_diarization(audio_path):
+     # Get speaker segments from the diarization model
+     diarization_result = diarization_pipe(audio_path)

+     # Load the input audio once; each segment is sliced from this waveform
+     waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

+     # Transcribe each speaker segment with the Whisper ASR pipeline;
+     # itertracks(yield_label=True) yields (segment, track, speaker_label)
+     transcripts = []
+     for segment, track, speaker in diarization_result.itertracks(yield_label=True):
+         start_sample = int(segment.start * sample_rate)
+         end_sample = int(segment.end * sample_rate)
+         interval_audio = waveform[:, start_sample:end_sample]
+         # Export the segment as a temporary WAV file and transcribe it
+         torchaudio.save("interval_audio.wav", interval_audio, sample_rate)
+         transcript = asr_pipe("interval_audio.wav")["text"]
+         transcripts.append(transcript)

+     # Combine the transcriptions from all speakers
+     text = " ".join(transcripts)
+     return text

  iface = gr.Interface(
+     fn=transcribe_with_diarization,
+     inputs=gr.Audio(source="microphone", type="filepath"),
      outputs="text",
+     title="Whisper Small Hindi with Speaker Diarization",
+     description="Real-time demo for Hindi speech recognition using a fine-tuned Whisper small model with speaker diarization.",
  )

+ iface.launch()
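
For reference, a minimal sketch of how the diarization output added in this commit is consumed: pyannote's itertracks(yield_label=True) yields (segment, track, label) triples, where the Segment object carries the start and end times in seconds. The HF_TOKEN environment variable and the recording.wav input file below are hypothetical placeholders, not part of the commit.

import os
from pyannote.audio import Pipeline

# Minimal sketch, assuming a valid Hugging Face token in HF_TOKEN (hypothetical)
pipe = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=os.environ["HF_TOKEN"],
)

diarization = pipe("recording.wav")  # hypothetical input file

# Each iteration yields (segment, track, speaker_label);
# segment is a pyannote.core.Segment with .start and .end in seconds
for segment, track, speaker in diarization.itertracks(yield_label=True):
    print(f"{speaker}: {segment.start:.2f}s - {segment.end:.2f}s")

Getting this triple order wrong (e.g., unpacking the Segment into the track variable) still runs but silently mislabels the fields, which is why the loop in app.py unpacks segment first.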