import gradio as gr
import torchaudio
from pyannote.audio import Pipeline
from transformers import pipeline
# Load the speaker diarization pipeline (requires a Hugging Face access token)
diarization_pipe = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token="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",
)
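# The diarization pipeline returns a pyannote.core.Annotation; iterating it with
# itertracks(yield_label=True) yields (segment, track, speaker_label) tuples,
# which is how the speaker turns are walked in transcribe_with_diarization below.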
# Load the speech-to-text model (Whisper)
asr_pipe = pipeline("automatic-speech-recognition", model="SyedAunZaidi/whisper-small-hi")
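# When given an audio file path, the ASR pipeline returns a dict with a "text" key,
# which is how each segment transcript is read out below.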
def transcribe_with_diarization(audio_path):
    # Get speaker segments using the diarization model
    diarization_result = diarization_pipe(audio_path)
    # Load the recorded audio once; each speaker turn is sliced out of this waveform
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)
    # Extract speaker segments and transcribe them using Whisper ASR
    transcripts = []
    for segment, track, speaker in diarization_result.itertracks(yield_label=True):
        start_sample = int(segment.start * sample_rate)
        end_sample = int(segment.end * sample_rate)
        interval_audio = waveform[:, start_sample:end_sample]
        # Export the interval audio as a temporary WAV file and transcribe it
        torchaudio.save("interval_audio.wav", interval_audio, sample_rate)
        transcript = asr_pipe("interval_audio.wav")["text"]
        # Keep the diarized speaker label alongside its transcript
        transcripts.append(f"{speaker}: {transcript}")
    # Combine the transcriptions from all speakers
    text = " ".join(transcripts)
    return text
iface = gr.Interface(
    fn=transcribe_with_diarization,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small Hindi with Speaker Diarization",
    description="Real-time demo for Hindi speech recognition using a fine-tuned Whisper small model with speaker diarization.",
)
iface.launch()
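# Optional: iface.launch(share=True) would expose a temporary public URL for the demo.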