whisperaudio / app.py
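# Gradio demo: speech-to-text with speaker diarization.
# The app segments an audio file by speaker with pyannote.audio, then
# transcribes each segment with an OpenAI Whisper ASR pipeline.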
from transformers import pipeline
import gradio as gr
import torchaudio
from pyannote.audio import Pipeline
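# Load the speaker diarization pipeline. The pyannote models are gated, so the
# token below must belong to an account that has accepted their access terms.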
diarization_pipe = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token="YOUR_HF_TOKEN",  # replace with your own Hugging Face access token
)
# Load the speech-to-text model (Whisper)
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
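# "openai/whisper-tiny" is the smallest multilingual Whisper checkpoint; a larger
# checkpoint (e.g. "openai/whisper-large-v2") can be substituted for better accuracy.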
def transcribe_with_diarization(audio_file, mic_audio):
    # Use the uploaded file if provided, otherwise the microphone recording
    audio_path = audio_file.name if audio_file is not None else mic_audio

    # Get speaker segments using the diarization model
    diarization_result = diarization_pipe(audio_path)

    # Load the full waveform once, then slice it per speaker segment
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

    # Extract each speaker segment and transcribe it with Whisper ASR
    transcripts = []
    for segment, _, speaker in diarization_result.itertracks(yield_label=True):
        start_sample = int(segment.start * sample_rate)
        end_sample = int(segment.end * sample_rate)
        interval_audio = waveform[:, start_sample:end_sample]

        # Export the segment as a temporary WAV file and run ASR on it
        torchaudio.save("interval_audio.wav", interval_audio, sample_rate)
        transcript = asr_pipe("interval_audio.wav")
        # Label each chunk with its diarization speaker tag
        transcripts.append(f"{speaker}: {transcript['text'].strip()}")

    # Combine the transcriptions from all speakers
    text = " ".join(transcripts)
    return text
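# Gradio UI: accepts either an uploaded audio file or a microphone recording
# and returns the combined, speaker-labelled transcript.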
iface = gr.Interface(
    fn=transcribe_with_diarization,
    inputs=[
        gr.File(label="Audio File"),
        gr.Audio(source="microphone", type="filepath", label="Microphone"),
    ],
    outputs="text",
    title="Whisper Tiny with Speaker Diarization",
    description="Demo for speech recognition (including Hindi) using the multilingual Whisper tiny model with pyannote speaker diarization.",
)
iface.launch()