# whisperaudio / app.py
# NOTE: the following Hugging Face web-UI residue ("Hunzla's picture /
# Update app.py / c009378 / raw / history blame / 2.37 kB") was copied in
# with the file listing; it is not Python and broke the module at import
# time, so it is preserved here only as a comment.
import os

import gradio as gr
import torchaudio
from pyannote.audio import Pipeline
from pyannote.core import Annotation
from pydub import AudioSegment
from transformers import pipeline
# Speaker-diarization model (pyannote). The auth token is read from the
# environment rather than hard-coded: committing a live HF token to source
# control leaks the credential (the original token must be revoked).
# Set HF_TOKEN in the Space's secrets / the process environment.
diarization_pipe = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
# Speech-to-text model: fine-tuned Whisper (small) for Hindi.
asr_pipe = pipeline("automatic-speech-recognition", model="SyedAunZaidi/whisper-small-hi")
def transcribe_with_diarization(audio_path):
    """Transcribe an audio file, running ASR separately on each diarized turn.

    Parameters
    ----------
    audio_path : str
        Path to the audio file supplied by the Gradio microphone widget.

    Returns
    -------
    str
        Per-segment transcripts joined with spaces, in diarization order.
    """
    # Speaker segmentation. itertracks(yield_label=True) yields
    # (segment, track_name, speaker_label) triples — the original unpacked
    # these in the wrong order and then operated on the wrong objects.
    diarization_result = diarization_pipe(audio_path)

    # Load the submitted audio once, outside the loop. The original
    # reloaded a hard-coded /content/drive/.../recording.mp3 on every
    # iteration and ignored `audio_path` entirely.
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

    transcripts = []
    for segment, _track, _speaker in diarization_result.itertracks(yield_label=True):
        # Slice the waveform to this speaker turn.
        start_sample = int(segment.start * sample_rate)
        end_sample = int(segment.end * sample_rate)
        interval_audio = waveform[:, start_sample:end_sample]
        if interval_audio.shape[1] == 0:
            # Degenerate (empty) segment — nothing to transcribe.
            continue

        # The HF ASR pipeline accepts a file path, so round-trip the
        # segment through a temporary WAV file.
        torchaudio.save("interval_audio.wav", interval_audio, sample_rate)
        result = asr_pipe("interval_audio.wav")
        transcripts.append(result["text"])

    # Combine the transcriptions from all speaker turns.
    return " ".join(transcripts)
# Gradio UI: record from the microphone, pass the temp-file path to the
# transcription function, and display the combined transcript.
iface = gr.Interface(
    fn=transcribe_with_diarization,
    # `filetype` is not a gr.Audio parameter (it raised a TypeError at
    # construction); `format` is the supported kwarg for requesting WAV.
    inputs=gr.Audio(source="microphone", type="filepath", format="wav"),
    outputs="text",
    # Title/description corrected: the loaded model is whisper-SMALL-hi,
    # not a Whisper large checkpoint.
    title="Whisper Small Hindi with Speaker Diarization",
    description="Real-time demo for Hindi speech recognition using a fine-tuned Whisper small model with speaker diarization.",
)
iface.launch()