Spaces:
Runtime error
Runtime error
File size: 2,122 Bytes
b0b1ade c009378 b0b1ade c009378 b0b1ade c009378 2ee0f99 c009378 04bdae1 c009378 64bceb8 c5fe8de c009378 b0b1ade c009378 37ad5f1 c5fe8de 37ad5f1 b0b1ade c5fe8de c009378 b0b1ade c009378 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import os
import tempfile

import gradio as gr
import torchaudio
from pyannote.audio import Pipeline
from pyannote.core import Annotation
from pydub import AudioSegment
from transformers import pipeline
# Speaker-diarization pipeline. The pyannote checkpoint is gated, so it needs a
# Hugging Face access token. The token is read from the HF_TOKEN environment
# variable (e.g. a Space secret) instead of being committed to the repository.
# NOTE(review): a token was previously hard-coded here; it should be revoked.
diarization_pipe = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
# NOTE(review): pyannote appears to return None (rather than raise) when the
# gated pipeline cannot be downloaded; fail loudly at start-up in that case.
if diarization_pipe is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization@2.1'; "
        "set HF_TOKEN to a token that has accepted the model's user conditions."
    )

# Load the speech-to-text model (Whisper)
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
def transcribe_with_diarization(audio_path, microphone_path=None):
    """Transcribe an audio file one diarized speaker turn at a time.

    The recording is split into speaker turns by the diarization pipeline,
    each turn is written to a scratch WAV file and transcribed by the ASR
    pipeline, and the per-turn texts are joined with single spaces in the
    order the diarization result yields them.

    Args:
        audio_path: Path to the audio file, or an object exposing the path
            as ``.name`` (such as an uploaded-file wrapper). May be ``None``.
        microphone_path: Optional fallback path, used only when
            ``audio_path`` is ``None``.

    Returns:
        The combined transcript, or ``""`` when no audio was supplied.
    """
    source = audio_path if audio_path is not None else microphone_path
    if source is None:
        return ""
    if not isinstance(source, (str, os.PathLike)):
        # Upload widgets may hand over a temp-file wrapper instead of a path;
        # its ``name`` attribute holds the on-disk location.
        source = getattr(source, "name", source)

    # Get speaker segments using the diarization model.
    diarization_result = diarization_pipe(source)

    # Decode the recording once; it does not change between segments.
    waveform, sample_rate = torchaudio.load(source, normalize=True)

    texts = []
    # A private scratch directory keeps concurrent requests from overwriting
    # each other's clip and removes the file when transcription is done.
    with tempfile.TemporaryDirectory() as scratch_dir:
        clip_path = os.path.join(scratch_dir, "interval_audio.wav")
        # itertracks(yield_label=True) yields (segment, track, label); only
        # the segment's time bounds are needed here.
        for segment, _track, _speaker in diarization_result.itertracks(
            yield_label=True
        ):
            start_sample = int(segment.start * sample_rate)
            end_sample = int(segment.end * sample_rate)
            clip = waveform[:, start_sample:end_sample]
            if clip.shape[-1] == 0:
                # Nothing to transcribe for a zero-length slice.
                continue
            torchaudio.save(clip_path, clip, sample_rate)
            # The ASR pipeline returns a dict; the transcript is under "text".
            result = asr_pipe(clip_path)
            texts.append(result["text"].strip())

    # Combine the transcriptions from all speaker turns.
    return " ".join(texts)
# Gradio UI: audio can be uploaded as a file or recorded from the microphone.
# NOTE(review): the title and description advertise a fine-tuned "Whisper
# Large Hindi" model, while the ASR pipeline in this file loads
# "openai/whisper-tiny" — confirm which one is intended.
iface = gr.Interface(
    fn=transcribe_with_diarization,
    inputs=[
        gr.File(label="Audio File"),
        # `filetype` is not a gr.Audio parameter, so it has been dropped;
        # type="filepath" already delivers the recording as a path on disk.
        gr.Audio(source="microphone", type="filepath", label="Microphone"),
    ],
    outputs="text",
    title="Whisper Large Hindi with Speaker Diarization",
    description="Real-time demo for Hindi speech recognition using a fine-tuned Whisper large model with speaker diarization.",
)
iface.launch()
|